Last active
May 8, 2018 04:05
-
-
Save limitpointinf0/f9ad24ff75c25bd1588fcf1d4d4bd20d to your computer and use it in GitHub Desktop.
Simple LSTM for NLP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np # linear algebra | |
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
import string | |
import re | |
from nltk.tokenize import word_tokenize | |
import keras | |
from keras.models import Sequential | |
from keras.layers import Dense, Dropout, LSTM | |
from keras.utils import to_categorical | |
from keras.callbacks import EarlyStopping | |
from keras.optimizers import SGD, Adam | |
import os | |
def mk_token(txt, char_lvl=True, low_cs=True, punct=True, stops=None, stem=None):
    """Clean and tokenize a text string.

    Parameters
    ----------
    txt : str
        Raw input text.
    char_lvl : bool
        If True, tokenize into single characters; otherwise use nltk's
        ``word_tokenize`` for word-level tokens.
    low_cs : bool
        If True, lowercase every token.
    punct : bool
        If False, strip punctuation characters from tokens and then drop
        any token that is not purely alphabetic. (True keeps punctuation.)
    stops : iterable or None
        Optional collection of stop tokens to remove.
    stem : object or None
        Optional stemmer exposing a ``.stem(word)`` method
        (e.g. nltk's PorterStemmer instance).

    Returns
    -------
    list of str
        The cleaned token list.
    """
    if char_lvl:
        txt = list(txt)
    else:
        txt = word_tokenize(txt)
    if low_cs:
        txt = [w.lower() for w in txt]
    if not punct:
        # Remove punctuation chars inside tokens, then drop non-alphabetic tokens.
        table = str.maketrans('', '', string.punctuation)
        txt = [w.translate(table) for w in txt]
        txt = [w for w in txt if w.isalpha()]
    if stops is not None:
        stop_words = set(stops)
        txt = [w for w in txt if w not in stop_words]
    if stem is not None:
        # BUG FIX: original body referenced an undefined name ``porter``
        # (NameError for any caller passing a stemmer); use ``stem``.
        txt = [stem.stem(w) for w in txt]
    return txt
def code_vocab(txt, forw=True):
    """Build forward and reverse enumerations of the unique tokens in ``txt``.

    ``forw`` is accepted for backward compatibility but is not used.

    Returns a pair of dicts: token -> index and index -> token.
    """
    unique_tokens = list(set(txt))
    token_to_index = {tok: idx for idx, tok in enumerate(unique_tokens)}
    index_to_token = {idx: tok for idx, tok in enumerate(unique_tokens)}
    return token_to_index, index_to_token
def seq_text(txt, seq_length=1):
    """Build supervised (input, target) pairs from an integer-encoded text.

    Slides a window of ``seq_length`` tokens over ``txt``; each window is an
    input sample and the token immediately following it is the target.

    Parameters
    ----------
    txt : sequence of int
        Integer-encoded tokens (e.g. characters mapped through the
        ``ch_int`` dict produced by ``code_vocab``).
    seq_length : int
        Number of timesteps per input sample.

    Returns
    -------
    X : np.ndarray, shape (n_samples, seq_length, 1)
        Inputs, scaled into [0, 1) by dividing by the vocabulary size.
    y : np.ndarray, shape (n_samples, max_label + 1), float32
        One-hot encoded targets.
    """
    X, y = [], []
    for i in range(len(txt) - seq_length):
        X.append(txt[i:i + seq_length])
        y.append(txt[i + seq_length])
    X = np.reshape(X, (len(X), seq_length, 1))
    # Scale token ids by vocabulary size so LSTM inputs are in [0, 1).
    X = X / float(len(set(txt)))
    # CONSISTENCY/DEPENDENCY FIX: original called keras.utils.to_categorical
    # wrapped in np.array despite importing to_categorical directly; this
    # pure-numpy one-hot is equivalent (num_classes = max label + 1,
    # float32) and keeps the helper usable without a keras backend.
    y = np.asarray(y, dtype=int)
    y = np.eye(int(y.max()) + 1, dtype=np.float32)[y]
    return X, y
# Build a keras LSTM model with configurable stacked layers and dropout.
def simple_LSTM(input_shape, nodes_per=None, hidden=0, out=2, act_out='softmax', act_hid='relu', drop=True, d_rate=0.1):
    """Generate a keras LSTM network with an arbitrary number of stacked layers.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, e.g. ``(seq_length, 1)``.
    nodes_per : list of int, optional
        Units per LSTM layer; element 0 is the first layer, the remaining
        elements feed the ``hidden`` extra layers. Defaults to [60].
    hidden : int
        Number of additional stacked LSTM layers (capped by the number of
        sizes available in ``nodes_per[1:]``, matching the original zip()).
    out : int
        Size of the final Dense output layer.
    act_out : str
        Activation for the output layer.
    act_hid : str
        Unused; accepted for backward compatibility with existing callers.
    drop : bool
        If True, add a Dropout layer after every LSTM layer.
    d_rate : float
        Dropout rate.

    Returns
    -------
    keras.models.Sequential
        The assembled (uncompiled) model.
    """
    # Avoid the shared-mutable-default pitfall while keeping the old default.
    if nodes_per is None:
        nodes_per = [60]
    # Extra layers actually built: the original zip(range(hidden), nodes_per[1:])
    # capped this at min(hidden, len(nodes_per) - 1); the slice does the same.
    stacked = nodes_per[1:hidden + 1] if hidden > 0 else []
    model = Sequential()
    # BUG FIX: only return sequences when another LSTM layer follows; the
    # original always set return_sequences=True, so with hidden == 0 the
    # Dense head was applied per-timestep (3D output) instead of per-sample.
    model.add(LSTM(nodes_per[0], input_shape=input_shape,
                   return_sequences=bool(stacked)))
    if drop:
        model.add(Dropout(d_rate))
    for idx, units in enumerate(stacked):
        # BUG FIX: intermediate stacked LSTMs must return sequences so the
        # next LSTM receives 3D input; the original omitted this, which
        # raises a shape error for hidden > 1.
        model.add(LSTM(units, return_sequences=(idx < len(stacked) - 1)))
        if drop:
            model.add(Dropout(d_rate))
    model.add(Dense(out, activation=act_out))
    # Removed the bare try/except that swallowed every error and silently
    # returned None; failures now surface to the caller.
    return model
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment