@greed2411
Created July 2, 2018 08:03
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Dropout
from keras.layers import LSTM
from keras.regularizers import l2
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split  # used below; was missing from the original imports
# assume the dataframe `train` has two columns: `Phrase` (the text) and the target `Sentiment`.
# this is a text-classification example with five output classes.
# Tokenizer splits and filters the text, then maps each word to an integer based on a fitted dictionary.
# character-level sequencing is also possible by passing `char_level=True` to Tokenizer.
# it converts text to lower case by default.
# More here: https://keras.io/preprocessing/text/ or hit the docstrings.
tok_raw = Tokenizer()
tok_raw.fit_on_texts(train['Phrase'])
# convert the Phrase column into integer sequences, stored in a new column 'seq_phrase'
train["seq_phrase"] = tok_raw.texts_to_sequences(train['Phrase'])
MAX_PHRASE_SEQ = 20  # sequence length / number of timesteps fed to the LSTM
VOCAB_SIZE = len(tok_raw.word_index) + 1  # dictionary size; +1 because word indices start at 1 and 0 is reserved for padding
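# pad_sequences (used inside get_keras_data below) left-pads short sequences
# with zeros and truncates long ones, so every phrase becomes a fixed-length
# row. A minimal illustration with a made-up sequence:
print(pad_sequences([[1, 2, 3]], maxlen=MAX_PHRASE_SEQ))
# -> [[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 3]], shape (1, 20)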
def get_keras_data(dataset):
    df_dict = {
        'phrase': pad_sequences(dataset.seq_phrase, maxlen=MAX_PHRASE_SEQ),
        'sentiment': to_categorical(dataset.Sentiment)
    }
    return df_dict
train_dict = get_keras_data(train)  # note: `train` is still the raw DataFrame here
X_train, X_test, y_train, y_test = train_test_split(train_dict['phrase'], train_dict['sentiment'],
                                                    test_size=0.2, random_state=42)
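# quick shape check: phrases should be an (n_samples, MAX_PHRASE_SEQ) integer
# array and targets one-hot (n_samples, 5) from to_categorical.
print(X_train.shape, y_train.shape)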
EMBEDDING_DIM = 256
OUTPUT_DIM = train.Sentiment.nunique()
model = Sequential()
model.add(Embedding(VOCAB_SIZE, EMBEDDING_DIM, input_length=MAX_PHRASE_SEQ))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(OUTPUT_DIM, activation='softmax', kernel_regularizer=l2(0.02)))  # use sigmoid instead for multi-label classification
model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))
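# a hedged inference sketch (the phrase below is made up): unseen text has to
# go through the exact same tokenizer and padding before model.predict.
import numpy as np

new_phrase = ["the plot was thin but the acting saved it"]  # hypothetical input
new_seq = pad_sequences(tok_raw.texts_to_sequences(new_phrase), maxlen=MAX_PHRASE_SEQ)
probs = model.predict(new_seq)  # shape (1, OUTPUT_DIM), one softmax score per class
print(int(np.argmax(probs, axis=1)[0]))  # index of the predicted sentiment class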