Richard Townsend (Sentimentron)
from theano import config, tensor

def build_model(tparams, options, maxw, training=True):
    # Input variables
    xc0 = tensor.tensor3('xc0', dtype='int8')
    xc1 = tensor.tensor3('xc1', dtype='int8')
    mask0 = tensor.tensor4('mask0', dtype=config.floatX)
    mask1 = tensor.tensor4('mask1', dtype=config.floatX)
    y = tensor.vector('y', dtype='int8')
    y_mask = tensor.vector('y_mask', dtype='float32')
    n_batch = xc0.shape[2]
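The variables above are symbolic Theano inputs; nothing is computed until they are compiled into a function. As a minimal stand-alone sketch (the tensor shape and the sum are illustrative assumptions, not part of build_model), this is how such a symbolic input is compiled and fed a NumPy array:

import numpy
import theano
from theano import tensor

# Hypothetical example: compile a function over an int8 tensor3 shaped like xc0
# (batch on the last axis, matching n_batch = xc0.shape[2]) and evaluate it.
xc0 = tensor.tensor3('xc0', dtype='int8')
total = xc0.sum()
f = theano.function([xc0], total)
print(f(numpy.ones((5, 4, 3), dtype='int8')))  # 5 * 4 * 3 = 60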
Sentimentron / baseline.py (created March 3, 2017 00:17)
Baseline for Quora duplicate questions dataset
import csv
import numpy
import scipy
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
def train_test_split(x, y, test_size=0.05):
    l = int(len(x) * test_size)
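A minimal sketch of how these imports might be wired into a duplicate-question baseline, assuming the two questions of a pair are simply concatenated into one string; the pairing scheme, toy data, and classifier choice below are assumptions for illustration, not the gist's actual pipeline:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier

# Toy question pairs: label 1 = duplicate, 0 = not a duplicate
pairs = [("How do I learn Python?", "What is the best way to learn Python?"),
         ("How do I learn Python?", "What is the capital of France?")]
labels = [1, 0]

texts = [q1 + " " + q2 for q1, q2 in pairs]
baseline = Pipeline([
    ("counts", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf", SGDClassifier()),
])
baseline.fit(texts, labels)
print(baseline.predict(texts))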
Sentimentron / process.js (created November 10, 2016 23:59)
First stage of the #Election2016 twitter map pipeline
var sqlite3 = require('sqlite3').verbose();
var fileExists = require('file-exists');
var dracula = require('dracula-sentiment');
var request = require('request');
// This is the database file written to by the collection script...
var databaseFile = "tweets.sqlite";
var db = new sqlite3.Database(databaseFile);
// ...and this is the database we'll generate.
var outDatabaseFile = "analysis.sqlite";
var createDatabase = !fileExists(outDatabaseFile);
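For a quick look at what this first stage consumes and produces, here is a hedged Python sketch that copies rows from tweets.sqlite into analysis.sqlite. The table and column names (tweets, id, text, analysis, score) are assumptions for illustration, and score_text is a placeholder standing in for the dracula-sentiment call:

import sqlite3

def score_text(text):
    # Placeholder: the real pipeline scores tweets with dracula-sentiment.
    return 0.0

src = sqlite3.connect("tweets.sqlite")
dst = sqlite3.connect("analysis.sqlite")
dst.execute("CREATE TABLE IF NOT EXISTS analysis (id INTEGER PRIMARY KEY, score REAL)")
for tweet_id, text in src.execute("SELECT id, text FROM tweets"):
    dst.execute("INSERT OR REPLACE INTO analysis (id, score) VALUES (?, ?)",
                (tweet_id, score_text(text)))
dst.commit()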
Sentimentron / run.sh (created November 10, 2016 23:32)
Running `collect.js`
#!/bin/bash
# This is the “Consumer Key (API Key)” field on apps.twitter.com
export TWITTER_CONSUMER_KEY=""
# This is the “Consumer Secret (API Secret)” field on apps.twitter.com
export TWITTER_CONSUMER_SECRET=""
# This is the “Access Token” from the “Your Access Token” section
export TWITTER_ACCESS_TOKEN_KEY=""
# This is the “Access Token Secret” from the same section
export TWITTER_ACCESS_TOKEN_SECRET=""
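Since collect.js is launched with these four variables exported, a small pre-flight check can catch a missing export before the collector silently fails to authenticate. This Python sketch assumes only the variable names shown above:

import os

required = ["TWITTER_CONSUMER_KEY", "TWITTER_CONSUMER_SECRET",
            "TWITTER_ACCESS_TOKEN_KEY", "TWITTER_ACCESS_TOKEN_SECRET"]
missing = [name for name in required if not os.environ.get(name)]
if missing:
    raise SystemExit("Missing Twitter credentials: " + ", ".join(missing))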
Sentimentron / collect.js (created November 10, 2016 23:22)
A basic collection script for tracking a Twitter topic
// collect.js - Track a Twitter topic and dump it into a SQLite database
var sqlite3 = require('sqlite3').verbose();
var fileExists = require('file-exists');
var request = require('request');
var Twitter = require("twitter");
var _ = require('lodash');
// If the output database doesn't exist, create it.
var databaseFile = "tweets.sqlite";
var createDatabase = !fileExists(databaseFile);
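Once the collector has been running for a while, a quick way to confirm that data is arriving, without assuming anything about the schema collect.js creates, is to list the tables in tweets.sqlite and count their rows (a small Python sketch):

import sqlite3

db = sqlite3.connect("tweets.sqlite")
tables = [row[0] for row in
          db.execute("SELECT name FROM sqlite_master WHERE type='table'")]
for table in tables:
    # Table names cannot be bound parameters, so interpolate them directly;
    # this is fine for a local sanity check.
    count = db.execute("SELECT COUNT(*) FROM %s" % table).fetchone()[0]
    print(table, count)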
with tf.Graph().as_default():
    tparams = init_tparams(params)
    # Initialize the TensorFlow session
    sm = tf.train.SessionManager(ready_op=tf.assert_variables_initialized())
    # Create TensorFlow variables from the parameters
    saver = tf.train.Saver(tparams)
    # ...
    sess = sm.prepare_session("", init_op=tf.initialize_all_variables(),
                              saver=saver)
    # ... after completing a certain number of minibatches ...
def cost_fn(cur):
    print options['ydim']
    # TensorFlow does not support negative indexing,
    # so labels = cur[:, :, -1] cannot be used here.
    labels = cur[:, options['ydim']]  # TODO: don't hard-code me
    labels = tf.cast(labels, dtype='int32')
    preds = cur[:, :options['ydim']]
    # The non-sparse version of this: y
    return tf.nn.sparse_softmax_cross_entropy_with_logits(preds, labels)
def cost_scan_i(i, j, free_var):
    return -tensor.log(i[tensor.arange(n_batch), j] + 1e-8)

cost, _ = theano.scan(cost_scan_i, outputs_info=None,
                      sequences=[pred, y, tensor.arange(n_batch)])
cost = cost.mean()
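Both cost fragments above boil down to the same quantity: the average negative log-probability of each example's true class (the TensorFlow op applies the softmax to its logits internally before taking that log). A small NumPy check with a hypothetical batch of softmax outputs makes the indexing concrete:

import numpy

pred = numpy.array([[0.7, 0.3],
                    [0.2, 0.8]])   # hypothetical softmax outputs for a batch of 2
y = numpy.array([0, 1])            # true class for each example
n_batch = pred.shape[0]
# Pick each example's probability of its true class, then average the negative log,
# mirroring -tensor.log(i[tensor.arange(n_batch), j] + 1e-8) followed by .mean().
cost = -numpy.log(pred[numpy.arange(n_batch), y] + 1e-8).mean()
print(cost)   # roughly 0.29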
def lstm_layer(tparams, state_below, options, prefix='lstm',
               mask=None, go_backwards=False, mult=1):
    nsteps = state_below.shape[0]
    if state_below.ndim == 3:
        n_samples = state_below.shape[1]
    else:
        n_samples = 1
    assert mask is not None

    def _slice(_x, n, dim):
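The preview cuts off at _slice. In the deeplearning.net LSTM tutorial code that this layer closely resembles, the helper slices one gate's worth of columns out of the concatenated pre-activations; the body below is that standard implementation, assumed (not confirmed) to match the missing one:

    def _slice(_x, n, dim):
        # Take the n-th block of `dim` columns: gate n of the concatenated
        # input/forget/output/cell pre-activations.
        if _x.ndim == 3:
            return _x[:, :, n * dim:(n + 1) * dim]
        return _x[:, n * dim:(n + 1) * dim]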
def lstm_bidirectional_layer(tparams, state_below, options, prefix='lstm'):
    def _p(pp, name):
        return '%s_%s' % (pp, name)
    lstm_fw_cell = rnn_cell.BasicLSTMCell(32, forget_bias=1.0)
    lstm_bw_cell = rnn_cell.BasicLSTMCell(32, forget_bias=1.0)  # TODO: don't hard-code me
    lstm_fw_multicell = rnn_cell.MultiRNNCell([lstm_fw_cell] * options['letter_layers'])
    lstm_bw_multicell = rnn_cell.MultiRNNCell([lstm_bw_cell] * options['letter_layers'])
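The forward and backward multi-layer cells are presumably run over the sequence in opposite directions and their outputs combined per time step, which is what makes the layer bidirectional. As a toy NumPy illustration of that idea only (not the TensorFlow call the gist actually uses):

import numpy

def toy_rnn(inputs, w=0.9, h0=0.0):
    # A scalar "RNN": h_t = tanh(w * h_{t-1} + x_t)
    h, outputs = h0, []
    for x in inputs:
        h = numpy.tanh(w * h + x)
        outputs.append(h)
    return outputs

seq = [0.1, 0.5, -0.2]                       # hypothetical 1-D input sequence
fw = toy_rnn(seq)                            # forward pass
bw = toy_rnn(seq[::-1])[::-1]                # backward pass, re-aligned to time order
bidir = [numpy.array([f, b]) for f, b in zip(fw, bw)]  # per-step concatenation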