
@Lanme
Last active November 6, 2018 10:13
some_simple_algorithm
#forked from https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-069
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
max_features = 20000
maxlen = 100
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
train = train.sample(frac=1)
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").values
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
model = get_model()
batch_size = 32
epochs = 2
file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early] #early
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)
model.load_weights(file_path)
y_test = model.predict(X_te)
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("baseline.csv", index=False)
"""
This script uses a CNN with 1-D convolution kernels for text classification (2-D convolutions are mainly used for image processing).
Step 1: Preprocess the text: tokenize -> remove stop words -> keep the top 5000 words as feature words
Step 2: Assign an ID to each feature word
Step 3: Convert each text into a sequence of IDs and left-pad it to a fixed length
Step 4: Shuffle the training set
Step 5: Embedding layer maps each word ID to a word vector
Step 6: Add a Conv1D convolution layer
Step 7: Add a pooling layer
Step 8: Add a fully connected layer; loss function = binary_crossentropy
Step 9: The output layer uses a sigmoid
(A minimal sketch of steps 1-3 is included after the parameter settings below.)
from: https://blog.csdn.net/xiewenbo/article/details/77874080
"""
from __future__ import print_function
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras.models import model_from_json
import numpy as np
# set parameters:
max_features = 5001
maxlen = 100
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 10
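# Steps 1-3 of the docstring above (tokenize, assign word IDs, left-pad) are not
# shown in this script, which loads pre-tokenized ID sequences from x_train.txt.
# A minimal, self-contained sketch of those steps; the two example sentences and
# the demo_* names are illustrative only and not part of the original pipeline:
from keras.preprocessing.text import Tokenizer
demo_texts = ["this movie was great", "this movie was terrible"]
demo_tokenizer = Tokenizer(num_words=max_features)        # keep only the top feature words
demo_tokenizer.fit_on_texts(demo_texts)                   # assign an ID to each word
demo_ids = demo_tokenizer.texts_to_sequences(demo_texts)  # text -> ID sequences
demo_x = sequence.pad_sequences(demo_ids, maxlen=maxlen)  # left-pad to a fixed length of maxlen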
x_train=np.loadtxt("x_train.txt",dtype=int)
y_train=np.loadtxt("y_train.txt",dtype=int)
indices = np.arange(x_train.shape[0])
np.random.shuffle(indices)
x_train = x_train[indices]
y_train = y_train[indices]
print('Loading data...')
#x_train=np.loadtxt("x_train.txt",dtype=int)
#y_train=np.loadtxt("y_train.txt",dtype=int)
x_test=x_train[20000:]
y_test=y_train[20000:]
x_train=x_train[:20000]
y_train=y_train[:20000]
#x_test=x_train
#y_test=y_train
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print(x_train[:1])
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('Build model...')
model = Sequential()
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.5))
# we add a Convolution1D, which will learn word-group filters of size kernel_size:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.5))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model.h5")
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
loaded_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
score = loaded_model.evaluate(x_test, y_test, verbose=0)
print(score)
# coding: utf-8
# In[1]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
# In[2]:
# load the dataset
mnist = input_data.read_data_sets("MNIST_data/",one_hot=True)
# input images are 28*28
n_inputs = 28 # one row of the image is fed per time step; each row has 28 values
max_time = 28 # 28 rows in total
lstm_size = 500 # number of hidden units
n_classes = 10 # 10 classes
batch_size = 50 # 50 samples per batch
n_batch = mnist.train.num_examples // batch_size # number of batches per epoch
print(n_batch)
# None here means the first dimension can be of any length
x = tf.placeholder(tf.float32,[None,784])
# ground-truth labels
y = tf.placeholder(tf.float32,[None,10])
# initialize the weights
weights = tf.Variable(tf.truncated_normal([lstm_size, n_classes], stddev=0.1))
# initialize the biases
biases = tf.Variable(tf.constant(0.1, shape=[n_classes]))
# define the RNN network
def RNN(X,weights,biases):
    # inputs=[batch_size, max_time, n_inputs]
    inputs = tf.reshape(X,[-1,max_time,n_inputs])
    # define the basic LSTM cell
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    # final_state[0] is the cell state
    # final_state[1] is the hidden state, i.e. the outputs of the 500 hidden units at the last time step
    # outputs: with time_major=False this has shape [batch_size=50, max_time=28, cell.output_size=500],
    #   i.e. the outputs of the 500 hidden units at every time step 0..27
    # final_state: a (cell_state, hidden_state) pair, each of shape [batch_size=50, state_size=500]
    outputs,final_state = tf.nn.dynamic_rnn(lstm_cell,inputs,dtype=tf.float32)
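    # Illustrative note (not in the original): for this single-layer LSTM with
    # time_major=False and full-length sequences, final_state[1] (the hidden
    # state h at the last time step) equals outputs[:, -1, :], so the last time
    # slice of outputs could be used interchangeably below.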
    print(outputs[2])
    # return class logits computed from the final hidden state; the softmax is
    # applied later by softmax_cross_entropy_with_logits
    results = tf.matmul(final_state[1],weights) + biases
    return results
# compute the RNN output (class logits)
prediction= RNN(x, weights, biases)
# loss function (softmax_cross_entropy_with_logits applies the softmax internally)
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y))
# optimize with AdamOptimizer
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
# store the per-sample comparison results in a boolean list
correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1)) # argmax returns the index of the largest value in a 1-D tensor
# compute the accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32)) # cast correct_prediction to float32
# initialization
init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(6):
        for batch in range(n_batch):
            batch_xs,batch_ys = mnist.train.next_batch(batch_size)
            sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})
        acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})
        print("Iter " + str(epoch) + ", Testing Accuracy= " + str(acc))
    saver.save(sess,'net/my_net.ckpt')
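# A minimal sketch of restoring the checkpoint saved above for later inference;
# it reuses the graph built in this script and the path from the save call:
with tf.Session() as sess:
    saver.restore(sess, 'net/my_net.ckpt')
    acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})
    print("Restored model, Testing Accuracy= " + str(acc))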
# In[ ]: