jesuisnicolasdavid/binary_text_classification_with_imdb.py

## binary_text_classification_with_imdb.py
# Load the IMDB sentiment corpus, label it, tokenise it and fit word2vec.
# NOTE(review): `pd`, `np`, `tf` and `word2vec` are used by this script but no
# import for them is visible in this part of the file -- confirm they exist.


def _read_reviews(path):
    """Read one review file into a DataFrame whose column 0 holds the text.

    sep='delimiter' is a separator that is not expected to occur in the
    reviews, so each line is presumably kept whole in column 0.
    NOTE(review): a review containing the literal word "delimiter" would be
    split into several columns -- verify against the data.
    """
    return pd.read_csv(path, engine='python', sep='delimiter', header=None)


def _tokenize(reviews):
    """Split every review on single spaces after trimming surrounding spaces.

    No row is ever dropped (str.split(' ') always returns at least one
    token), so the result stays aligned with the label column.
    """
    return [review.strip(' ').split(' ') for review in reviews]


#import data
train_pos = _read_reviews("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/train/pos/train_pos.txt")
train_neg = _read_reviews("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/train/neg/train_neg.txt")
test_pos = _read_reviews("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/test/pos/test_pos.txt")
test_neg = _read_reviews("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/test/neg/test_neg.txt")

#transform data: column 1 carries the label (1 = positive, 0 = negative)
train_pos[1] = 1
train_neg[1] = 0
test_pos[1] = 1
test_neg[1] = 0
x_tn = pd.concat([train_pos, train_neg]).reset_index(drop=True)
x_tt = pd.concat([test_pos, test_neg]).reset_index(drop=True)
X_train, y_train = x_tn[0], x_tn[1]
X_test, y_test = x_tt[0], x_tt[1]

# Real lists (not lazy iterators) because the token lists are iterated several
# times below: by build_vocab, by train and when building document vectors.
X_train_split = _tokenize(X_train)
X_test_split = _tokenize(X_test)

n_dim = 300
#Initialize model and build vocab (words seen fewer than 10 times are ignored)
imdb_w2v = word2vec.Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(X_train_split)

#Train the model over train_reviews
imdb_w2v.train(X_train_split)
#Build one vector per review by averaging the word vectors of all in-vocabulary words (scaling is applied afterwards, outside this function)
def buildWordVector(text, size, model=None):
    """Return the mean word vector of ``text`` as a ``(1, size)`` array.

    Args:
        text: Iterable of tokens (words) making up one document.
        size: Dimensionality of the word vectors.
        model: Mapping-like object where ``model[word]`` yields a vector of
            length ``size`` and raises ``KeyError`` for unknown words.
            Defaults to the module-level ``imdb_w2v`` model.

    Returns:
        A float array of shape ``(1, size)`` holding the average of the
        vectors of all known words, or all zeros when no word of ``text`` is
        known to the model (including empty ``text``).
    """
    if model is None:
        # Resolved at call time so the function keeps working with the
        # script's global model when no explicit model is supplied.
        model = imdb_w2v

    vec = np.zeros((1, size))
    count = 0
    for word in text:
        try:
            vec += np.asarray(model[word]).reshape((1, size))
        except KeyError:
            # Out-of-vocabulary words do not contribute to the average.
            continue
        count += 1
    if count:
        vec /= count
    return vec

from sklearn.preprocessing import scale

# One averaged word vector per training review, then standardised per feature.
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_train_split])
train_vecs = scale(train_vecs)

#Train word2vec on test reviews
# NOTE(review): this updates the embeddings with test text after the training
# vectors were built, and the test set is scaled independently below -- confirm
# this is intended, it leaks test-set statistics into the features.
imdb_w2v.train(X_test_split)

#Build test review vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_test_split])
test_vecs = scale(test_vecs)

# One-hot encode the labels with NumPy: column 0 = negative, column 1 = positive.
# feed_dict values must be plain arrays (not tf.Tensor objects), and both label
# sets must match the [None, 2] shape of the `y` placeholder.
y_train = np.asarray(y_train, dtype=np.float32).reshape(-1, 1)
y_train = np.concatenate([1 - y_train, y_train], axis=1)
y_test = np.asarray(y_test, dtype=np.float32).reshape(-1, 1)
y_test = np.concatenate([1 - y_test, y_test], axis=1)

x = tf.placeholder(tf.float32, shape=[None, n_dim])
y = tf.placeholder(tf.float32, shape=[None, 2])
# Input -> Layer 1 (single linear layer, i.e. softmax regression)
W1 = tf.Variable(tf.zeros([n_dim, 2]))
b1 = tf.Variable(tf.zeros([2]))
logits = tf.matmul(x, W1) + b1
pred = tf.nn.softmax(logits)
# The loss op applies softmax itself, so it is given the raw logits; feeding it
# `pred` would apply softmax twice.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
train_step = tf.train.GradientDescentOptimizer(0.3).minimize(cost)

# Evaluation ops are built once, before the session starts.
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

init_op = tf.initialize_all_variables()

with tf.Session() as sess:
    # Initialise exactly once; re-running the initialiser inside the loop would
    # reset the weights before every step and discard all previous training.
    sess.run(init_op)
    for i in range(200):
        train_step.run(feed_dict={x: train_vecs, y: y_train})
    print("Accuracy: %s" % accuracy.eval({x: test_vecs, y: y_test}))
	# Second copy of the pipeline, with its indentation restored.
# NOTE(review): this section repeats the script above (load data, train
# word2vec, train and evaluate the classifier); it looks like a paste artefact
# and can probably be deleted -- confirm before removing.
# NOTE(review): `pd`, `np`, `tf` and `word2vec` are used by this script but no
# import for them is visible in this part of the file -- confirm they exist.


def _read_reviews(path):
    """Read one review file into a DataFrame whose column 0 holds the text.

    sep='delimiter' is a separator that is not expected to occur in the
    reviews, so each line is presumably kept whole in column 0.
    """
    return pd.read_csv(path, engine='python', sep='delimiter', header=None)


def _tokenize(reviews):
    """Split every review on single spaces after trimming surrounding spaces."""
    return [review.strip(' ').split(' ') for review in reviews]


#import data
train_pos = _read_reviews("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/train/pos/train_pos.txt")
train_neg = _read_reviews("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/train/neg/train_neg.txt")
test_pos = _read_reviews("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/test/pos/test_pos.txt")
test_neg = _read_reviews("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/test/neg/test_neg.txt")

#transform data: column 1 carries the label (1 = positive, 0 = negative)
train_pos[1] = 1
train_neg[1] = 0
test_pos[1] = 1
test_neg[1] = 0
x_tn = pd.concat([train_pos, train_neg]).reset_index(drop=True)
x_tt = pd.concat([test_pos, test_neg]).reset_index(drop=True)
X_train, y_train = x_tn[0], x_tn[1]
X_test, y_test = x_tt[0], x_tt[1]
# Real lists, because the token lists are iterated several times below.
X_train_split = _tokenize(X_train)
X_test_split = _tokenize(X_test)

n_dim = 300
#Initialize model and build vocab
imdb_w2v = word2vec.Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(X_train_split)

#Train the model over train_reviews
imdb_w2v.train(X_train_split)


#Build one vector per review by averaging the vectors of all its known words
def buildWordVector(text, size, model=None):
    """Return the mean word vector of ``text`` as a ``(1, size)`` array.

    Args:
        text: Iterable of tokens (words) making up one document.
        size: Dimensionality of the word vectors.
        model: Mapping-like object where ``model[word]`` yields a vector of
            length ``size`` and raises ``KeyError`` for unknown words.
            Defaults to the module-level ``imdb_w2v`` model.

    Returns:
        A float array of shape ``(1, size)``; all zeros when no word of
        ``text`` is known to the model.
    """
    if model is None:
        model = imdb_w2v

    vec = np.zeros((1, size))
    count = 0
    for word in text:
        try:
            vec += np.asarray(model[word]).reshape((1, size))
        except KeyError:
            # Out-of-vocabulary words do not contribute to the average.
            continue
        count += 1
    if count:
        vec /= count
    return vec


from sklearn.preprocessing import scale
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_train_split])
train_vecs = scale(train_vecs)

#Train word2vec on test reviews
# NOTE(review): training on test text and scaling the test set independently
# leaks test-set statistics into the features -- confirm this is intended.
imdb_w2v.train(X_test_split)

#Build test review vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_test_split])
test_vecs = scale(test_vecs)

# One-hot encode the labels with NumPy: column 0 = negative, column 1 = positive.
# feed_dict values must be plain arrays, and both label sets must match the
# [None, 2] shape of the `y` placeholder.
y_train = np.asarray(y_train, dtype=np.float32).reshape(-1, 1)
y_train = np.concatenate([1 - y_train, y_train], axis=1)
y_test = np.asarray(y_test, dtype=np.float32).reshape(-1, 1)
y_test = np.concatenate([1 - y_test, y_test], axis=1)

x = tf.placeholder(tf.float32, shape=[None, n_dim])
y = tf.placeholder(tf.float32, shape=[None, 2])
# Input -> Layer 1 (single linear layer, i.e. softmax regression)
W1 = tf.Variable(tf.zeros([n_dim, 2]))
b1 = tf.Variable(tf.zeros([2]))
logits = tf.matmul(x, W1) + b1
pred = tf.nn.softmax(logits)
# The loss op applies softmax itself, so it is given the raw logits.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
train_step = tf.train.GradientDescentOptimizer(0.3).minimize(cost)

correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

init_op = tf.initialize_all_variables()

with tf.Session() as sess:
    # Initialise once; doing it inside the loop would reset the weights on
    # every iteration.
    sess.run(init_op)
    for i in range(200):
        train_step.run(feed_dict={x: train_vecs, y: y_train})
    print("Accuracy: %s" % accuracy.eval({x: test_vecs, y: y_test}))