# Gist by @jesuisnicolasdavid (last active February 4, 2016):
# word2vec embeddings + TensorFlow softmax classifier for IMDB sentiment.
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import word2vec

# Import data: sep='delimiter' is a regex that the python engine never
# matches, so each line of the file is read whole as a single column
train_pos = pd.read_csv("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/train/pos/train_pos.txt", engine='python', sep='delimiter', header=None)
train_neg = pd.read_csv("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/train/neg/train_neg.txt", engine='python', sep='delimiter', header=None)
test_pos = pd.read_csv("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/test/pos/test_pos.txt", engine='python', sep='delimiter', header=None)
test_neg = pd.read_csv("/Users/nicolas/work/Dataset/imdb_sentiment/imdb_sentiment_text/test/neg/test_neg.txt", engine='python', sep='delimiter', header=None)
# Label the reviews: 1 = positive, 0 = negative
train_pos[1] = 1
train_neg[1] = 0
test_pos[1] = 1
test_neg[1] = 0
x_tn = pd.concat([train_pos,train_neg]).reset_index(drop=True)
x_tt = pd.concat([test_pos,test_neg]).reset_index(drop=True)
X_train, y_train = x_tn[0], x_tn[1]
X_test, y_test = x_tt[0], x_tt[1]
# Tokenize: strip surrounding whitespace, split on spaces, drop empty reviews
X_train_split = filter(None, [review.strip().split(' ') for review in X_train])
X_test_split = filter(None, [review.strip().split(' ') for review in X_test])
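# Quick look at the tokenized data, not in the original gist: each entry
# should now be a plain list of tokens
print X_train_split[0][:10]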
n_dim = 300
#Initialize model and build vocab
imdb_w2v = word2vec.Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(X_train_split)
#Train the model over train_reviews
imdb_w2v.train(X_train_split)
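# Optional sanity check, not in the original gist: nearest neighbours of a
# common sentiment word should look plausible once training has converged
# (assumes 'good' survived the min_count=10 cutoff)
print imdb_w2v.most_similar('good')[:5]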
# Build a review vector by averaging the word2vec vectors of all
# in-vocabulary words in the review, then scale
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            # word was dropped by the min_count cutoff; skip it
            continue
    if count != 0:
        vec /= count
    return vec
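# Example usage, not in the original gist: out-of-vocabulary tokens are
# skipped by the KeyError handler, so the result is always shape (1, n_dim)
print buildWordVector(['great', 'movie'], n_dim).shape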
from sklearn.preprocessing import scale
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_train_split])
train_vecs = scale(train_vecs)
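# Shape check, not in the original gist: one scaled n_dim vector per review
print train_vecs.shape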
# Continue training word2vec over the test reviews
imdb_w2v.train(X_test_split)
# Build test review vectors, then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_test_split])
test_vecs = scale(test_vecs)
# One-hot encode the labels in numpy: a tf.Tensor cannot be passed through
# feed_dict, and y_test needs the same two-column shape as y_train
y_train = np.reshape(y_train, (len(y_train), 1))
y_train = np.concatenate([1 - y_train, y_train], axis=1)
y_test = np.reshape(y_test, (len(y_test), 1))
y_test = np.concatenate([1 - y_test, y_test], axis=1)
x = tf.placeholder(tf.float32, shape=[None, n_dim])
y = tf.placeholder(tf.float32, shape=[None, 2])
# Single softmax layer: input -> 2 classes
W1 = tf.Variable(tf.zeros([n_dim, 2]))
b1 = tf.Variable(tf.zeros([2]))
#h1 = tf.nn.sigmoid(tf.matmul(x, W1) + b1)  # unused hidden-layer experiment
# Keep the raw logits separate: softmax_cross_entropy_with_logits applies
# softmax internally, so feeding it pred would apply softmax twice
logits = tf.matmul(x, W1) + b1
pred = tf.nn.softmax(logits)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, y))
train_step = tf.train.GradientDescentOptimizer(0.3).minimize(cost)
with tf.Session() as sess:
    # Initialize once, before the loop: re-initializing on every iteration
    # would erase all training progress
    init_op = tf.initialize_all_variables()
    sess.run(init_op)
    for i in xrange(200):
        train_step.run(feed_dict={x: train_vecs, y: y_train})
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy on the test vectors
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print "Accuracy:", accuracy.eval({x: test_vecs, y: y_test})