@vgpena
Last active November 17, 2020 16:39

Text Classification with Keras and TensorFlow

Blog post is here

If you want an intro to neural nets and the "long version" of what this is and what it does, read my blog post.

Data can be downloaded here. Many thanks to ThinkNook for putting such a great resource out there.

Installation

You need Python 2 to run this project; I also recommend virtualenv and IPython.

Run pip install -r requirements.txt to install everything listed in requirements.txt.
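For example (a rough sketch; assumes virtualenv is installed and the exact commands depend on your shell):

virtualenv venv
source venv/bin/activate
pip install -r requirements.txt
pip install h5py   # needed by model.save_weights(); see the comments below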

Usage

You need to train your net once, and then you can load those settings and use it whenever you want without having to retrain it.

Training

Change line 10 of makeModel.py to point to wherever you downloaded your data as a CSV.
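For instance, if you saved the ThinkNook CSV somewhere on disk, line 10 would end up looking something like this (the path below is just a placeholder):

training = np.genfromtxt('/home/you/Sentiment Analysis Dataset.csv', delimiter=',', skip_header=1, usecols=(1, 3), dtype=None)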

Then run python makeModel.py (or, if you're in IPython, run makeModel.py). Then go do something else for the 40-60 minutes it takes to train your neural net.

When training finishes, three new files should have been created: dictionary.json, model.json, and model.h5. You will need these to use the net.

Classification

To use the net to classify data, run python loadModel.py and type a sentence into the console when prompted. Hitting Enter without typing anything will quit the program.
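A session might look like this (the output values are illustrative, not real results):

$ python loadModel.py
Input a sentence to be evaluated, or Enter to quit: i love this
positive sentiment; 92.310715% confidence
Input a sentence to be evaluated, or Enter to quit:
$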

loadModel.py

import json
import numpy as np
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
from keras.models import model_from_json

# we're still going to use a Tokenizer here, but we don't need to fit it
tokenizer = Tokenizer(num_words=3000)

# for human-friendly printing
labels = ['negative', 'positive']

# read in our saved dictionary
with open('dictionary.json', 'r') as dictionary_file:
    dictionary = json.load(dictionary_file)

# this utility makes sure that all the words in your input
# are registered in the dictionary
# before trying to turn them into a matrix.
def convert_text_to_index_array(text):
    words = kpt.text_to_word_sequence(text)
    wordIndices = []
    for word in words:
        if word in dictionary:
            wordIndices.append(dictionary[word])
        else:
            print("'%s' not in training corpus; ignoring." % (word))
    return wordIndices

# read in your saved model structure
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
# and create a model from that
model = model_from_json(loaded_model_json)
# and weight your nodes with your saved values
model.load_weights('model.h5')

# okay here's the interactive part
while 1:
    evalSentence = raw_input('Input a sentence to be evaluated, or Enter to quit: ')
    if len(evalSentence) == 0:
        break
    # format your input for the neural net
    testArr = convert_text_to_index_array(evalSentence)
    input = tokenizer.sequences_to_matrix([testArr], mode='binary')
    # predict which bucket your input belongs in
    pred = model.predict(input)
    # and print it for the humons
    print("%s sentiment; %f%% confidence" % (labels[np.argmax(pred)], pred[0][np.argmax(pred)] * 100))
makeModel.py

import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
import numpy as np

# extract data from a csv
# notice the cool options to skip lines at the beginning
# and to only take data from certain columns
training = np.genfromtxt('/path/to/your/data.csv', delimiter=',', skip_header=1, usecols=(1, 3), dtype=None)

# create our training data from the tweets
train_x = [x[1] for x in training]
# index all the sentiment labels
train_y = np.asarray([x[0] for x in training])

# only work with the 3000 most popular words found in our dataset
max_words = 3000

# create a new Tokenizer
tokenizer = Tokenizer(num_words=max_words)
# feed our tweets to the Tokenizer
tokenizer.fit_on_texts(train_x)

# Tokenizers come with a convenient list of words and IDs
dictionary = tokenizer.word_index
# Let's save this out so we can use it later
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)

def convert_text_to_index_array(text):
    # `text_to_word_sequence` tokenizes the text the same way the Tokenizer
    # did when it was fit: lowercasing, stripping punctuation, and splitting
    # it into individual words.
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

allWordIndices = []
# for each tweet, change each token to its ID in the Tokenizer's word_index
for text in train_x:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)

# now we have a list of all tweets converted to index arrays.
# cast as an array for future usage.
allWordIndices = np.asarray(allWordIndices)

# create one-hot matrices out of the indexed tweets
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
# treat the labels as categories
train_y = keras.utils.to_categorical(train_y, 2)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(train_x, train_y,
          batch_size=32,
          epochs=5,
          verbose=1,
          validation_split=0.1,
          shuffle=True)

model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model.h5')

print('saved model!')
requirements.txt

backports.weakref==1.0rc1
bleach==1.5.0
funcsigs==1.0.2
html5lib==0.9999999
Keras==2.0.6
Markdown==2.2.0
mock==2.0.0
numpy==1.13.1
pbr==3.1.1
protobuf==3.3.0
PyYAML==3.12
scipy==0.19.1
six==1.10.0
tensorflow==1.2.0
Theano==0.9.0
Werkzeug==0.12.2
@timpal0l

Is there any quick fix to make it work with Python 3?

  File "/usr/local/lib/python3.5/dist-packages/keras/preprocessing/text.py", line 47, in text_to_word_sequence
    text = text.translate(translate_map)
TypeError: a bytes-like object is required, not 'dict'

@kvlinden

@timpal0l Python 3 defaults to Unicode, and it appears that the Keras tokenizer.fit_on_texts() checks for strings. I got makeModel.py to run by explicitly converting the tweets to strings:
train_x = [str(x[1]) for x in training]
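In context, that workaround looks something like this (a sketch against the genfromtxt call in makeModel.py; the path is a placeholder):

# Python 3: genfromtxt with dtype=None yields bytes, so convert the tweet
# column to str before fitting the Tokenizer.
training = np.genfromtxt('/path/to/your/data.csv', delimiter=',',
                         skip_header=1, usecols=(1, 3), dtype=None)
train_x = [str(x[1]) for x in training]
# x[1].decode('utf-8') would avoid the b'...' prefix, but str() is enough
# to get past the TypeError.
train_y = np.asarray([x[0] for x in training])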

@wroales

wroales commented Feb 14, 2018

Thanks kvlinden! I got the model working.

vendor_id : GenuineIntel
cpu family : 6
model : 23
model name : Pentium(R) Dual-Core CPU E5300 @ 2.60GHz

Linux version 4.4.0-112-generic (buildd@lgw01-amd64-010) (gcc version 5.4.0 20160609 (Ubuntu 5.4.0-6ubuntu1~16.04.5) ) #135-Ubuntu SMP Fri Jan 19 11:48:36 UTC 2018

@malgamves

Hey, I'm getting a MemoryError on line 48. Anybody else getting that?

@cfowlerdev

@malgamves,
I was seeing the same error on my Ubuntu machine; however, it works perfectly on my iMac. Both have 8GB of RAM.

It turns out this example uses a whole lot of RAM. If you're on a Mac it should be OK, because macOS dynamically allocates virtual memory from your disk when you run out of physical RAM. However, it would also fail on a Mac if you're low on disk space or have virtual memory disabled.

On my Ubuntu it was failing due to the swapfile (Ubuntu's virtual memory) being only 2GB. I had to increase it all the way to 16GB before this finally worked. On other Linux flavours you'd have to increase the size of your swap partition, which is a lot more work.

Haven't tried it on Windows, but afaik you would need to increase the size of your pagefile in the Advanced section of your System Settings.

@marius-tu

marius-tu commented May 7, 2018

@malgamves @cfowlerdev
I'm getting this error too, but increasing my swap space to 16GB and more did not help. I'm still trying to figure out how to get this example working on Ubuntu 16.04.
Interestingly, Python doesn't seem to use that much memory: watching it in htop, a maximum of 2GB of RAM and ~600MB of swap space is used before the application crashes, even though I have 8GB of RAM and 64-bit Ubuntu.

Have you managed to get it running?

Edit: OK, got it working, but I had to increase the swapfile size to around 30GB.
Edit2: You also have to add h5py to the requirements; otherwise you have to train the network all over again after running makeModel.py.
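One way around the MemoryError (not something the gist itself does) is to skip building the full one-hot matrix up front and instead convert tweets to matrices one batch at a time with fit_generator. A rough sketch, assuming tokenizer, allWordIndices, and the categorical train_y from makeModel.py are already in scope:

import numpy as np

def batch_generator(word_indices, labels, batch_size=32):
    # Keras expects generators to loop forever
    while True:
        for start in range(0, len(word_indices), batch_size):
            batch_seqs = word_indices[start:start + batch_size]
            # build the binary bag-of-words matrix for just this batch
            batch_x = tokenizer.sequences_to_matrix(list(batch_seqs), mode='binary')
            batch_y = labels[start:start + batch_size]
            yield batch_x, batch_y

batch_size = 32
steps = int(np.ceil(len(allWordIndices) / float(batch_size)))
model.fit_generator(batch_generator(allWordIndices, train_y, batch_size),
                    steps_per_epoch=steps, epochs=5, verbose=1)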

@innajiyaharifins

I get an error:

ValueError                                Traceback (most recent call last)
in ()
     48 train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
     49 # treat the labels as categories
---> 50 train_y = keras.utils.to_categorical(train_y, 3)
     51
     52 from keras.models import Sequential

~\Anaconda3\lib\site-packages\keras\utils\np_utils.py in to_categorical(y, num_classes)
     18         A binary matrix representation of the input.
     19     """
---> 20     y = np.array(y, dtype='int').ravel()
     21     if not num_classes:
     22         num_classes = np.max(y) + 1

Your project has only 2 labels, positive and negative; I have 3 labels: advice, information, and prohibition.
How do I solve this problem?
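The ValueError happens because to_categorical needs integer class IDs, not label strings. A sketch of the changes for 3 classes (label names taken from the comment above):

# in makeModel.py: map label strings to integer IDs before one-hot encoding
label_names = ['advice', 'information', 'prohibition']
label_to_id = {name: i for i, name in enumerate(label_names)}
train_y = np.asarray([label_to_id[y] for y in train_y])
train_y = keras.utils.to_categorical(train_y, 3)

# ...and the output layer needs 3 units instead of 2
model.add(Dense(3, activation='softmax'))

# in loadModel.py, the human-friendly labels list changes to match
labels = ['advice', 'information', 'prohibition']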

@zbokaee

zbokaee commented Jul 17, 2018

Hi vgpena,
I tested your code and it was interesting, but I have a question about one-hot vectors vs. gensim word2vec (W2V).
Is it possible to use W2V with your code? I mean, how can I use my own gensim W2V, save my trained model, and after that use my neural net, like you have done in your code?
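That would mean replacing the bag-of-words input with an Embedding layer seeded from the gensim vectors, which is a different architecture from the one in this gist. A very rough sketch, assuming a gensim Word2Vec model saved at 'my_w2v.model' (placeholder path) and the dictionary/allWordIndices from makeModel.py:

from gensim.models import Word2Vec
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from keras.preprocessing.sequence import pad_sequences
import numpy as np

w2v = Word2Vec.load('my_w2v.model')   # your trained gensim model
embedding_dim = w2v.vector_size
max_len = 40                          # pad/truncate tweets to a fixed length

# embedding matrix: row i holds the vector for the word with index i
vocab_size = len(dictionary) + 1      # word_index IDs start at 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in dictionary.items():
    if word in w2v.wv:
        embedding_matrix[i] = w2v.wv[word]

# inputs are now padded sequences of word IDs, not binary matrices
train_x = pad_sequences(allWordIndices, maxlen=max_len)

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix],
                    input_length=max_len, trainable=False))
model.add(Flatten())
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# training and saving (model.to_json() / model.save_weights()) work the same as before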

@SahasG

SahasG commented Aug 5, 2018

Hi vgpena,

When running 'pip install -r requirements.txt' I am getting multiple errors where it is unable to build the wheel for scipy. Any idea on how to fix it?

@giriannamalai

This is good. But how do you export it as a SavedModel to deploy on GCP? Can you share the input and output params of the saved model?
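A rough sketch of one way to do that with TF 1.x's SavedModelBuilder (not part of the gist, and the exact API depends on your TensorFlow version); assumes model is the trained Keras model from makeModel.py:

import tensorflow as tf
from keras import backend as K

export_dir = './export/1'   # versioned subfolder, as GCP serving expects
builder = tf.saved_model.builder.SavedModelBuilder(export_dir)

# input is the 3000-wide bag-of-words vector, output the 2-way softmax
signature = tf.saved_model.signature_def_utils.predict_signature_def(
    inputs={'bow_input': model.input},
    outputs={'sentiment': model.output})

builder.add_meta_graph_and_variables(
    K.get_session(),
    tags=[tf.saved_model.tag_constants.SERVING],
    signature_def_map={
        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signature})
builder.save()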

@ahmadalli

On Windows, you'll need encoding='utf-8' when loading the data: training = np.genfromtxt('Sentiment Analysis Dataset.csv', delimiter=',', skip_header=1, usecols=(1, 3), dtype=None, encoding='utf-8')

@ahmadalli

Is there any reason you used the adam optimizer and not SGD?
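For comparison, swapping in SGD is a small change to model.compile (a sketch; the learning rate and momentum here are common defaults, not tuned for this dataset):

from keras.optimizers import SGD

model.compile(loss='categorical_crossentropy',
              optimizer=SGD(lr=0.01, momentum=0.9),
              metrics=['accuracy'])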
