Skip to content

Instantly share code, notes, and snippets.

@limitpointinf0
Last active May 8, 2018 04:05
Show Gist options
  • Save limitpointinf0/f9ad24ff75c25bd1588fcf1d4d4bd20d to your computer and use it in GitHub Desktop.
Save limitpointinf0/f9ad24ff75c25bd1588fcf1d4d4bd20d to your computer and use it in GitHub Desktop.
Simple LSTM for NLP
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import re
from nltk.tokenize import word_tokenize
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.optimizers import SGD, Adam
import os
def mk_token(txt, char_lvl=True, low_cs=True, punct=True, stops=None, stem=None):
    """Clean and tokenize a text string with configurable processing steps.

    Args:
        txt: raw input string.
        char_lvl: when True tokenize per character, otherwise per word
            via nltk's word_tokenize.
        low_cs: lowercase every token.
        punct: when False, strip punctuation characters and then drop any
            token that is not purely alphabetic.
        stops: optional iterable of stopword tokens to remove.
        stem: optional stemmer object exposing a ``.stem(token)`` method
            (e.g. nltk's PorterStemmer instance).

    Returns:
        List of processed tokens.
    """
    if char_lvl:
        tokens = list(txt)
    else:
        tokens = word_tokenize(txt)
    if low_cs:
        tokens = [t.lower() for t in tokens]
    if not punct:
        # Remove punctuation chars inside tokens, then discard anything
        # left that is not alphabetic (e.g. tokens reduced to '').
        table = str.maketrans('', '', string.punctuation)
        tokens = [t.translate(table) for t in tokens]
        tokens = [t for t in tokens if t.isalpha()]
    if stops is not None:
        stop_words = set(stops)
        tokens = [t for t in tokens if t not in stop_words]
    if stem is not None:
        # Bug fix: the original called porter.stem(w), but `porter` was
        # never defined (NameError). Use the supplied stemmer directly.
        tokens = [stem.stem(t) for t in tokens]
    return tokens
def code_vocab(txt, forw=True):
    """Build forward and reverse lookup tables over the unique tokens of txt.

    Args:
        txt: iterable of tokens (duplicates allowed).
        forw: unused; kept for backward compatibility with existing callers.
            NOTE(review): presumably meant to select which mapping to return
            — confirm before removing.

    Returns:
        Tuple of (token -> index dict, index -> token dict).
    """
    unique_tokens = list(set(txt))
    token_to_idx = {}
    idx_to_token = {}
    for idx, tok in enumerate(unique_tokens):
        token_to_idx[tok] = idx
        idx_to_token[idx] = tok
    return token_to_idx, idx_to_token
def seq_text(txt, seq_length=1):
    """Turn an integer-encoded token sequence into supervised training pairs.

    Each window of `seq_length` consecutive tokens becomes one input sample;
    the token immediately after the window is its target.

    Args:
        txt: sequence of integer-encoded tokens (assumes values are already
            ints — NOTE(review): raw characters would break the division
            and one-hot steps below; confirm callers encode first).
        seq_length: number of tokens per input window.

    Returns:
        Tuple (X, y): X has shape (n_samples, seq_length, 1), scaled to
        [0, 1) by the vocabulary size; y is the one-hot matrix of targets.
    """
    windows = []
    targets = []
    for start in range(len(txt) - seq_length):
        windows.append(txt[start:start + seq_length])
        targets.append(txt[start + seq_length])
    X = np.reshape(windows, (len(windows), seq_length, 1))
    # Normalize by vocabulary size so inputs fall in [0, 1).
    X = X / float(len(set(txt)))
    y = np.array(keras.utils.to_categorical(targets))
    return X, y
#Define a function which creates a keras LSTM model with hidden layers, activation functions, and dropout rates
def simple_LSTM(input_shape, nodes_per=(60,), hidden=0, out=2, act_out='softmax', act_hid='relu', drop=True, d_rate=0.1):
    """Generate a keras Sequential LSTM classifier.

    Args:
        input_shape: shape of one input sample, e.g. (seq_length, 1).
        nodes_per: units per LSTM layer; nodes_per[0] is the first layer,
            nodes_per[1:] feed the `hidden` additional LSTM layers.
            (Default changed from a mutable list to an equivalent tuple.)
        hidden: number of additional LSTM layers after the first.
        out: size of the Dense output layer (number of classes).
        act_out: activation for the output layer.
        act_hid: unused; kept for backward compatibility.
            NOTE(review): presumably intended as the hidden-layer
            activation — confirm before wiring it in.
        drop: whether to add a Dropout layer after every LSTM layer.
        d_rate: dropout rate used when `drop` is True.

    Returns:
        An uncompiled keras Sequential model.
    """
    model = Sequential()
    # Bug fix: the first LSTM must return sequences ONLY when further LSTM
    # layers follow. The original always set return_sequences=True, so with
    # hidden == 0 the Dense head received a 3-D tensor (per-timestep output
    # instead of one prediction per sample).
    model.add(LSTM(nodes_per[0], input_shape=input_shape,
                   return_sequences=hidden > 0))
    if drop:
        model.add(Dropout(d_rate))
    # Same truncation as the original zip(range(hidden), nodes_per[1:]):
    # build at most `hidden` extra layers, limited by available node counts.
    extra = nodes_per[1:hidden + 1]
    for i, units in enumerate(extra):
        # Bug fix: every stacked LSTM except the last must also return
        # sequences; the original left the default (False) on all of them,
        # which fails for hidden >= 2.
        model.add(LSTM(units, return_sequences=i < len(extra) - 1))
        if drop:
            model.add(Dropout(d_rate))
    model.add(Dense(out, activation=act_out))
    # Bug fix: removed the bare `except:` that swallowed every error,
    # printed a message, and implicitly returned None — callers then
    # crashed later with an opaque AttributeError. Errors now propagate.
    return model
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment