#!/usr/bin/env python
"""Minimal CNN binary classifier on random 4-D data (tf.keras)."""
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout


def generate_data():
    # 100 samples of shape (20, 30, 10); first half labeled 1, second half 0
    x = np.random.rand(100, 20, 30, 10)
    t = np.concatenate([np.ones(50), np.zeros(50)])
    return x, t


def main():
    x, t = generate_data()

    model = Sequential()
    model.add(Conv2D(filters=5,
                     kernel_size=(5, 5),
                     strides=(1, 1),
                     activation='tanh',
                     input_shape=(20, 30, 10),
                     padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Conv2D(filters=5,
                     kernel_size=(5, 5),
                     strides=(1, 1),
                     activation='tanh',
                     padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
    model.add(Flatten())
    model.add(Dense(units=100, activation='relu'))
    model.add(Dropout(rate=0.4))
    model.add(Dense(units=1, activation='sigmoid'))

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC()])
    model.summary()

    history = model.fit(x, t, batch_size=32, epochs=20)
    print(history.history)


if __name__ == "__main__":
    main()
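
# Not in the original gist: a hedged sketch of checking AUC outside of the
# Keras metrics, assuming main() were refactored to return the fitted model
# together with a held-out split (x_val, t_val). Uses scikit-learn's standard
# roc_auc_score helper.
def evaluate_auc(model, x_val, t_val):
    """Return ROC AUC of a fitted binary classifier on a held-out split."""
    from sklearn.metrics import roc_auc_score
    y_prob = model.predict(x_val).ravel()  # sigmoid outputs in [0, 1]
    return roc_auc_score(t_val, y_prob)
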
#!/usr/bin/env python
"""
LSTM spam classifier for the SMS Spam Collection dataset.
Referenced: https://ohke.hateblo.jp/entry/2019/04/27/154500
"""
import numpy as np
import pandas as pd
import tensorflow as tf


def load_data():
    pd.set_option('display.max_colwidth', 100)
    df_dataset = pd.read_csv('./SMSSpamCollection', sep='\t', header=None)
    df_dataset.rename({0: 'label', 1: 'text'}, axis=1, inplace=True)
    # Binary target: spam -> 1, ham -> 0
    df_dataset['category'] = df_dataset\
        .apply(lambda x: 1 if x['label'] == 'spam' else 0, axis=1)
    print('data loaded successfully!')
    print(df_dataset.head())
    return df_dataset


def split_dataset(df_dataset):
    from sklearn.model_selection import train_test_split
    X_train, X_test, Y_train, Y_test = train_test_split(
        df_dataset[['text']], df_dataset[['category']],
        test_size=0.2, random_state=0
    )
    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    return X_train, X_test, Y_train, Y_test


def prepro(X_train, X_test):
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences

    max_len = 100
    # Fit the vocabulary on the training texts only
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X_train['text'])

    x_v_train = tokenizer.texts_to_sequences(X_train['text'])
    x_v_test = tokenizer.texts_to_sequences(X_test['text'])

    print(X_train.head())
    for text, vector in zip(X_train['text'].head(3), x_v_train[0:3]):
        print(text)
        print(vector)

    # Pad/truncate every sequence to the same length
    print(len(x_v_train[0]), len(x_v_test[0]))
    x_v_train = pad_sequences(x_v_train, maxlen=max_len)
    x_v_test = pad_sequences(x_v_test, maxlen=max_len)
    print(len(x_v_train[0]), len(x_v_test[0]))

    return x_v_train, x_v_test, tokenizer


def train(tokenizer, x_v_train, x_v_test, Y_train, Y_test):
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import LSTM, Dense, Embedding

    # +1 because Tokenizer word indices start at 1 and 0 is the padding id
    vocabulary_size = len(tokenizer.word_index) + 1
    print('vocab size', vocabulary_size)

    model = Sequential()
    model.add(Embedding(input_dim=vocabulary_size, output_dim=32))
    model.add(LSTM(16, return_sequences=False))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    y_train = Y_train['category'].values
    y_test = Y_test['category'].values
    print(np.array(x_v_train).shape)
    print(np.array(y_train).shape)
    print(np.array(x_v_test).shape)
    print(np.array(y_test).shape)

    history = model.fit(x_v_train, y_train, batch_size=32, epochs=20,
                        validation_data=(x_v_test, y_test))
    return model


def main():
    df_dataset = load_data()
    X_train, X_test, Y_train, Y_test = split_dataset(df_dataset)
    print(Y_train.head())
    print(Y_test.head())
    x_v_train, x_v_test, tokenizer = prepro(X_train, X_test)
    model = train(tokenizer, x_v_train, x_v_test, Y_train, Y_test)


if __name__ == "__main__":
    main()
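
# Not in the original gist: a hedged sketch of scoring new SMS texts with
# the model and tokenizer returned by train() and prepro() above. The
# max_len default must match the padding length used in prepro() (100).
def predict_spam(model, tokenizer, texts, max_len=100):
    """Return a spam probability in [0, 1] for each raw text string."""
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    seqs = tokenizer.texts_to_sequences(texts)      # words -> integer ids
    padded = pad_sequences(seqs, maxlen=max_len)    # same length as training
    return model.predict(padded).ravel()
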
#!/usr/bin/env python
"""Minimal LSTM binary classifier on random 3-D sequence data (tf.keras)."""
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.metrics import roc_auc_score


def generate_data():
    # 100 sequences of 20 timesteps with 30 features each;
    # first half labeled 1, second half 0
    x = np.random.rand(100, 20, 30)
    t = np.concatenate([np.ones(50), np.zeros(50)])
    return x, t


def main():
    x, t = generate_data()

    model = Sequential()
    # Input shape is (timesteps, features) = (20, 30); unlike the spam
    # classifier above, no Embedding layer is needed because the inputs
    # are already dense vectors rather than integer word ids.
    model.add(LSTM(16, return_sequences=False, input_shape=(20, 30)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy', tf.keras.metrics.AUC()])
    model.summary()

    history = model.fit(x, t, batch_size=32, epochs=20)


if __name__ == "__main__":
    main()
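
# Not in the original gist: a hedged sketch showing how the roc_auc_score
# imported above could be applied after fit(), e.g. at the end of main().
# With no validation split here, it can only be computed on the training data.
def train_auc(model, x, t):
    """Return ROC AUC of the fitted model on the given data."""
    y_prob = model.predict(x).ravel()  # sigmoid outputs in [0, 1]
    return roc_auc_score(t, y_prob)
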