
@Lanme
Last active November 6, 2018 10:13
some_simple_algorithm
#forked from https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-069
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
# Any results you write to the current directory are saved as output.
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
max_features = 20000
maxlen = 100
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
train = train.sample(frac=1)
list_sentences_train = train["comment_text"].fillna("CVxTz").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values
list_sentences_test = test["comment_text"].fillna("CVxTz").values
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)
def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model
model = get_model()
batch_size = 32
epochs = 2
file_path="weights_base.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
early = EarlyStopping(monitor="val_loss", mode="min", patience=20)
callbacks_list = [checkpoint, early] #early
model.fit(X_t, y, batch_size=batch_size, epochs=epochs, validation_split=0.1, callbacks=callbacks_list)
model.load_weights(file_path)
y_test = model.predict(X_te)
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = y_test
sample_submission.to_csv("baseline.csv", index=False)
"""
This script uses a CNN with 1-D convolution kernels for text classification (2-D convolutions are mainly used for image processing).
Step 1: Preprocess the text: tokenize -> remove stop words -> keep the top 5000 words as feature words
Step 2: Assign an ID to each feature word
Step 3: Convert each text into a sequence of IDs and left-pad it to a fixed length
Step 4: Shuffle the training set
Step 5: Embedding layer maps each word ID to a word vector
Step 6: Add a Conv1D convolution layer
Step 7: Add a pooling layer
Step 8: Add a fully connected layer; loss function = binary_crossentropy
Step 9: The output layer uses a sigmoid
(A minimal sketch of steps 1-3 is included after the parameter settings below.)
from: https://blog.csdn.net/xiewenbo/article/details/77874080
"""
from __future__ import print_function
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.datasets import imdb
from keras.models import model_from_json
import numpy as np
# set parameters:
max_features = 5001
maxlen = 100
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 10
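# Steps 1-3 of the docstring above (tokenize, assign word IDs, left-pad) are not
# shown in this script, which loads pre-tokenized ID sequences from x_train.txt.
# A minimal, self-contained sketch of those steps; the two example sentences and
# the demo_* names are illustrative only and not part of the original pipeline:
from keras.preprocessing.text import Tokenizer
demo_texts = ["this movie was great", "this movie was terrible"]
demo_tokenizer = Tokenizer(num_words=max_features)        # keep only the top feature words
demo_tokenizer.fit_on_texts(demo_texts)                   # assign an ID to each word
demo_ids = demo_tokenizer.texts_to_sequences(demo_texts)  # text -> ID sequences
demo_x = sequence.pad_sequences(demo_ids, maxlen=maxlen)  # left-pad to a fixed length of maxlen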
x_train=np.loadtxt("x_train.txt",dtype=int)
y_train=np.loadtxt("y_train.txt",dtype=int)
indices = np.arange(x_train.shape[0])
np.random.shuffle(indices)
x_train = x_train[indices]
y_train = y_train[indices]
print('Loading data...')
#x_train=np.loadtxt("x_train.txt",dtype=int)
#y_train=np.loadtxt("y_train.txt",dtype=int)
x_test=x_train[20000:]
y_test=y_train[20000:]
x_train=x_train[:20000]
y_train=y_train[:20000]
#x_test=x_train
#y_test=y_train
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print(x_train[:1])
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('Build model...')
model = Sequential()
# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.5))
# we add a Convolution1D, which will learn word-group filters of size kernel_size:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.5))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model.h5")
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
loaded_model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
score = loaded_model.evaluate(x_test, y_test, verbose=0)
print(score)
# coding: utf-8
# In[1]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
# In[2]:
# load the dataset
mnist = input_data.read_data_sets("MNIST_data/",one_hot=True)
# input images are 28*28
n_inputs = 28 # one row of the image is fed per time step; each row has 28 values
max_time = 28 # 28 rows in total
lstm_size = 500 # number of hidden units
n_classes = 10 # 10 classes
batch_size = 50 # 50 samples per batch
n_batch = mnist.train.num_examples // batch_size # number of batches per epoch
print(n_batch)
# None here means the first dimension can be of any length
x = tf.placeholder(tf.float32,[None,784])
# ground-truth labels
y = tf.placeholder(tf.float32,[None,10])
# initialize the weights
weights = tf.Variable(tf.truncated_normal([lstm_size, n_classes], stddev=0.1))
# initialize the biases
biases = tf.Variable(tf.constant(0.1, shape=[n_classes]))
# define the RNN network
def RNN(X,weights,biases):
    # inputs=[batch_size, max_time, n_inputs]
    inputs = tf.reshape(X,[-1,max_time,n_inputs])
    # define the basic LSTM cell
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    # final_state[0] is the cell state
    # final_state[1] is the hidden state, i.e. the outputs of the 500 hidden units at the last time step
    # outputs: with time_major=False this has shape [batch_size=50, max_time=28, cell.output_size=500],
    #   i.e. the outputs of the 500 hidden units at every time step 0..27
    # final_state: a (cell_state, hidden_state) pair, each of shape [batch_size=50, state_size=500]
    outputs,final_state = tf.nn.dynamic_rnn(lstm_cell,inputs,dtype=tf.float32)
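    # Illustrative note (not in the original): for this single-layer LSTM with
    # time_major=False and full-length sequences, final_state[1] (the hidden
    # state h at the last time step) equals outputs[:, -1, :], so the last time
    # slice of outputs could be used interchangeably below.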
    print(outputs[2])
    # return class logits computed from the final hidden state; the softmax is
    # applied later by softmax_cross_entropy_with_logits
    results = tf.matmul(final_state[1],weights) + biases
    return results
# compute the RNN output (class logits)
prediction= RNN(x, weights, biases)
# loss function (softmax_cross_entropy_with_logits applies the softmax internally)
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction,labels=y))
# optimize with AdamOptimizer
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
# store the per-sample comparison results in a boolean list
correct_prediction = tf.equal(tf.argmax(y,1),tf.argmax(prediction,1)) # argmax returns the index of the largest value in a 1-D tensor
# compute the accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32)) # cast correct_prediction to float32
# initialization
init = tf.global_variables_initializer()
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(6):
        for batch in range(n_batch):
            batch_xs,batch_ys = mnist.train.next_batch(batch_size)
            sess.run(train_step,feed_dict={x:batch_xs,y:batch_ys})
        acc = sess.run(accuracy,feed_dict={x:mnist.test.images,y:mnist.test.labels})
        print("Iter " + str(epoch) + ", Testing Accuracy= " + str(acc))
    saver.save(sess,'net/my_net.ckpt')
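# A minimal sketch of restoring the checkpoint saved above for later inference;
# it reuses the graph built in this script and the path from the save call:
with tf.Session() as sess:
    saver.restore(sess, 'net/my_net.ckpt')
    acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})
    print("Restored model, Testing Accuracy= " + str(acc))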
# In[ ]: