Last active
May 8, 2018 04:05
-
-
Save limitpointinf0/f9ad24ff75c25bd1588fcf1d4d4bd20d to your computer and use it in GitHub Desktop.
Simple LSTM for NLP
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np # linear algebra | |
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) | |
import string | |
import re | |
from nltk.tokenize import word_tokenize | |
import keras | |
from keras.models import Sequential | |
from keras.layers import Dense, Dropout, LSTM | |
from keras.utils import to_categorical | |
from keras.callbacks import EarlyStopping | |
from keras.optimizers import SGD, Adam | |
import os | |
def mk_token(txt, char_lvl=True, low_cs=True, punct=True, stops=None, stem=None):
    """Clean and tokenize a text string.

    Parameters
    ----------
    txt : str
        Raw input text.
    char_lvl : bool
        If True, tokenize into single characters; otherwise use nltk's
        ``word_tokenize`` for word-level tokens.
    low_cs : bool
        If True, lowercase every token.
    punct : bool
        If False, strip punctuation characters from tokens and then drop
        any token that is not purely alphabetic. (True keeps punctuation.)
    stops : iterable or None
        Optional collection of stop tokens to remove.
    stem : object or None
        Optional stemmer exposing a ``.stem(word)`` method
        (e.g. nltk's PorterStemmer instance).

    Returns
    -------
    list of str
        The cleaned token list.
    """
    if char_lvl:
        txt = list(txt)
    else:
        txt = word_tokenize(txt)
    if low_cs:
        txt = [w.lower() for w in txt]
    if not punct:
        # Remove punctuation chars inside tokens, then drop non-alphabetic tokens.
        table = str.maketrans('', '', string.punctuation)
        txt = [w.translate(table) for w in txt]
        txt = [w for w in txt if w.isalpha()]
    if stops is not None:
        stop_words = set(stops)
        txt = [w for w in txt if w not in stop_words]
    if stem is not None:
        # BUG FIX: original body referenced an undefined name ``porter``
        # (NameError for any caller passing a stemmer); use ``stem``.
        txt = [stem.stem(w) for w in txt]
    return txt
def code_vocab(txt, forw=True):
    """Build forward and reverse enumerations of the unique tokens in ``txt``.

    ``forw`` is accepted for backward compatibility but is not used.

    Returns a pair of dicts: token -> index and index -> token.
    """
    unique_tokens = list(set(txt))
    token_to_index = {tok: idx for idx, tok in enumerate(unique_tokens)}
    index_to_token = {idx: tok for idx, tok in enumerate(unique_tokens)}
    return token_to_index, index_to_token
def seq_text(txt, seq_length=1):
    """Build supervised (input, target) pairs from an integer-encoded text.

    Slides a window of ``seq_length`` tokens over ``txt``; each window is an
    input sample and the token immediately following it is the target.

    Parameters
    ----------
    txt : sequence of int
        Integer-encoded tokens (e.g. characters mapped through the
        ``ch_int`` dict produced by ``code_vocab``).
    seq_length : int
        Number of timesteps per input sample.

    Returns
    -------
    X : np.ndarray, shape (n_samples, seq_length, 1)
        Inputs, scaled into [0, 1) by dividing by the vocabulary size.
    y : np.ndarray, shape (n_samples, max_label + 1), float32
        One-hot encoded targets.
    """
    X, y = [], []
    for i in range(len(txt) - seq_length):
        X.append(txt[i:i + seq_length])
        y.append(txt[i + seq_length])
    X = np.reshape(X, (len(X), seq_length, 1))
    # Scale token ids by vocabulary size so LSTM inputs are in [0, 1).
    X = X / float(len(set(txt)))
    # CONSISTENCY/DEPENDENCY FIX: original called keras.utils.to_categorical
    # wrapped in np.array despite importing to_categorical directly; this
    # pure-numpy one-hot is equivalent (num_classes = max label + 1,
    # float32) and keeps the helper usable without a keras backend.
    y = np.asarray(y, dtype=int)
    y = np.eye(int(y.max()) + 1, dtype=np.float32)[y]
    return X, y
# Build a keras LSTM model with configurable stacked layers and dropout.
def simple_LSTM(input_shape, nodes_per=None, hidden=0, out=2, act_out='softmax', act_hid='relu', drop=True, d_rate=0.1):
    """Generate a keras LSTM network with an arbitrary number of stacked layers.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, e.g. ``(seq_length, 1)``.
    nodes_per : list of int, optional
        Units per LSTM layer; element 0 is the first layer, the remaining
        elements feed the ``hidden`` extra layers. Defaults to [60].
    hidden : int
        Number of additional stacked LSTM layers (capped by the number of
        sizes available in ``nodes_per[1:]``, matching the original zip()).
    out : int
        Size of the final Dense output layer.
    act_out : str
        Activation for the output layer.
    act_hid : str
        Unused; accepted for backward compatibility with existing callers.
    drop : bool
        If True, add a Dropout layer after every LSTM layer.
    d_rate : float
        Dropout rate.

    Returns
    -------
    keras.models.Sequential
        The assembled (uncompiled) model.
    """
    # Avoid the shared-mutable-default pitfall while keeping the old default.
    if nodes_per is None:
        nodes_per = [60]
    # Extra layers actually built: the original zip(range(hidden), nodes_per[1:])
    # capped this at min(hidden, len(nodes_per) - 1); the slice does the same.
    stacked = nodes_per[1:hidden + 1] if hidden > 0 else []
    model = Sequential()
    # BUG FIX: only return sequences when another LSTM layer follows; the
    # original always set return_sequences=True, so with hidden == 0 the
    # Dense head was applied per-timestep (3D output) instead of per-sample.
    model.add(LSTM(nodes_per[0], input_shape=input_shape,
                   return_sequences=bool(stacked)))
    if drop:
        model.add(Dropout(d_rate))
    for idx, units in enumerate(stacked):
        # BUG FIX: intermediate stacked LSTMs must return sequences so the
        # next LSTM receives 3D input; the original omitted this, which
        # raises a shape error for hidden > 1.
        model.add(LSTM(units, return_sequences=(idx < len(stacked) - 1)))
        if drop:
            model.add(Dropout(d_rate))
    model.add(Dense(out, activation=act_out))
    # Removed the bare try/except that swallowed every error and silently
    # returned None; failures now surface to the caller.
    return model
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment