LSTM Modelling
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

# `data` is assumed to be a pandas DataFrame (loaded earlier) with
# 'Title', 'Description' and 'Category' columns.

# The maximum number of words to keep, based on word frequency
MAX_NB_WORDS = 20000
# Max number of tokens in each padded sequence
MAX_SEQUENCE_LENGTH = 50
# Dimensionality of the learned word embeddings
EMBEDDING_DIM = 100

# Combine each title and description into a single string
titles = data['Title'].values
descriptions = data['Description'].values
data_for_lstms = []
for i in range(len(titles)):
    data_for_lstms.append(str(titles[i]) + ' ' + str(descriptions[i]))

# Fit the tokenizer on the combined text
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data_for_lstms)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Convert the text to padded integer sequences
X = tokenizer.texts_to_sequences(data_for_lstms)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

# One-hot encode the labels
Y = pd.get_dummies(data['Category']).values
print('Shape of label tensor:', Y.shape)

# Split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42)
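# Optional sanity check (a minimal sketch, using only the objects defined
# above): inspect how one raw string maps to its padded index sequence,
# so the preprocessing is easy to verify by eye.
sample_text = data_for_lstms[0]
sample_seq = pad_sequences(tokenizer.texts_to_sequences([sample_text]), maxlen=MAX_SEQUENCE_LENGTH)
print('Raw text:', sample_text)
print('Padded sequence:', sample_seq[0])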
# Define the LSTM model
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(Y.shape[1], activation='softmax'))  # one output unit per category
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

# Train the LSTM model
epochs = 5
batch_size = 64
history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, validation_split=0.1)

# Plot training and validation loss
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='validation')
plt.legend()
plt.show()

# Plot training and validation accuracy (the history keys are 'acc'/'val_acc'
# in older Keras releases and 'accuracy'/'val_accuracy' in newer ones)
plt.title('Accuracy')
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='validation')
plt.legend()
plt.show()
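# Optional follow-up sketch: the split above reserves X_test/Y_test but never
# uses them, so evaluate the trained model on the held-out set and classify
# one new example. The ticket text below is made up for illustration.
loss, accuracy = model.evaluate(X_test, Y_test, batch_size=batch_size)
print('Test loss: %.4f  Test accuracy: %.4f' % (loss, accuracy))

# pd.get_dummies orders its columns alphabetically, so recover the label
# names the same way to decode the softmax output.
labels = pd.get_dummies(data['Category']).columns
new_text = ['App crashes when uploading a profile picture']
padded = pad_sequences(tokenizer.texts_to_sequences(new_text), maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
print('Predicted category:', labels[pred.argmax(axis=-1)[0]])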