Last active
February 4, 2016 16:20
-
-
Save jesuisnicolasdavid/95ff6f20c07965574d10 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Import data.
# NOTE(review): `pd` and `word2vec` are used below but no import for them is
# visible in this part of the file -- presumably `import pandas as pd` and
# `from gensim.models import word2vec`; confirm they are in scope.
#
# sep='delimiter' is a separator string that is not expected to occur in the
# text, so every line of a file ends up as a single value in column 0.
train_pos = pd.read_csv("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/train/pos/train_pos.txt", engine='python', sep='delimiter', header=None)
train_neg = pd.read_csv("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/train/neg/train_neg.txt", engine='python', sep='delimiter', header=None)
test_pos = pd.read_csv("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/test/pos/test_pos.txt", engine='python', sep='delimiter', header=None)
test_neg = pd.read_csv("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/test/neg/test_neg.txt", engine='python', sep='delimiter', header=None)

# Transform data: column 1 holds the label (1 for the "pos" files, 0 for "neg").
train_pos[1] = 1
train_neg[1] = 0
test_pos[1] = 1
test_neg[1] = 0

# Stack positive and negative examples; column 0 is the text, column 1 the label.
x_tn = pd.concat([train_pos, train_neg]).reset_index(drop=True)
x_tt = pd.concat([test_pos, test_neg]).reset_index(drop=True)
X_train, y_train = x_tn[0], x_tn[1]
X_test, y_test = x_tt[0], x_tt[1]

# Tokenize each review on single spaces.
# The loop variable deliberately does NOT reuse the names X_train / X_test:
# under Python 2 a list-comprehension variable leaks into the enclosing scope
# and would overwrite the Series with its last element.
# The former filter(None, ...) wrapper was dropped: str.split(' ') never
# returns an empty list, so it removed nothing, and on Python 3 it would have
# produced a one-shot iterator that build_vocab() exhausts before train().
X_train_split = [review.strip(' ').split(' ') for review in X_train]
X_test_split = [review.strip(' ').split(' ') for review in X_test]

n_dim = 300

# Initialize model and build vocab
imdb_w2v = word2vec.Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(X_train_split)

# Train the model over train_reviews
imdb_w2v.train(X_train_split)
# Build a word vector for one review by averaging the vectors of all its words
def buildWordVector(text, size, model=None):
    """Return the mean word vector of *text* as an array of shape (1, size).

    Args:
        text: iterable of word tokens.
        size: dimensionality of the word vectors.
        model: mapping from word to vector, indexed as ``model[word]``.
            Defaults to the module-level ``imdb_w2v`` model when omitted.

    Returns:
        A ``(1, size)`` float array holding the average of the vectors of
        every word found in ``model``. Words that raise ``KeyError`` on
        lookup are skipped; if no word is found the result is all zeros.
    """
    if model is None:
        # Resolved at call time so the global may be defined after this function.
        model = imdb_w2v
    vec = np.zeros((1, size))
    count = 0.
    for word in text:
        try:
            vec += model[word].reshape((1, size))
        except KeyError:
            # Word is not in the model's vocabulary.
            continue
        count += 1.
    if count != 0:
        vec /= count
    return vec
from sklearn.preprocessing import scale

# Build one averaged word vector per training review, then standardize.
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_train_split])
train_vecs = scale(train_vecs)

# Train word2vec on test reviews
# NOTE(review): this keeps training the embedding after the training vectors
# were built, so train and test features come from different model states, and
# it exposes the embedding to test data. Kept as in the original; confirm it
# is intended.
imdb_w2v.train(X_test_split)

# Build test review vectors then scale
# NOTE(review): scale() standardizes the test set with its own statistics
# rather than the training set's. Kept as in the original; confirm.
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_test_split])
test_vecs = scale(test_vecs)


def _one_hot(labels):
    """Return an (n, 2) float32 array [1 - label, label] for 0/1 labels."""
    column = np.asarray(labels, dtype=np.float32).reshape(-1, 1)
    return np.hstack([1 - column, column])


# Labels must be plain NumPy arrays shaped (n, 2) to be fed into `y`.
# The original built y_train with tf.concat (a Tensor, which feed_dict does not
# accept) and left y_test shaped (n, 1), which does not match the placeholder.
y_train = _one_hot(y_train)
y_test = _one_hot(y_test)

x = tf.placeholder(tf.float32, shape=[None, n_dim])
y = tf.placeholder(tf.float32, shape=[None, 2])

# Input -> Layer 1
W1 = tf.Variable(tf.zeros([n_dim, 2]))
b1 = tf.Variable(tf.zeros([2]))

# Calculating difference between label and output.
# softmax_cross_entropy_with_logits applies softmax itself, so it must receive
# the raw logits, not `pred` (which would apply softmax twice).
# Positional (logits, labels) order matches the pre-1.0 TensorFlow API that the
# rest of this file uses.
logits = tf.matmul(x, W1) + b1
pred = tf.nn.softmax(logits)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, y))
train_step = tf.train.GradientDescentOptimizer(0.3).minimize(cost)

# Calculate accuracy
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

# Build the init op once, outside the loop: re-running it on every step would
# reset W1/b1 to zero and discard all previous training.
init_op = tf.initialize_all_variables()

with tf.Session() as sess:
    sess.run(init_op)
    for i in range(200):
        train_step.run(feed_dict={x: train_vecs, y: y_train})
    print("Accuracy: %s" % accuracy.eval({x: test_vecs, y: y_test}))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment