import torch
import torch.nn as nn


class DilatedConvSentenceEncoder(nn.Module):
    """A Sentence Encoder with Dilated Convs."""

    def __init__(
        self, input_dim=512, hidden_dim=4096, n_layers=7,
        dropout=0.5, batch_first=True
    ):
        """Initialize params."""
        super(DilatedConvSentenceEncoder, self).__init__()
        self.input_dim = input_dim
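The gist preview cuts off after the first attribute. A minimal sketch of how the constructor might continue, assuming the remaining hyperparameters are stored and the encoder is a stack of nn.Conv1d layers whose dilation doubles per layer (the layer structure is an assumption, not visible in the original):

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.batch_first = batch_first
        # Hypothetical conv stack: kernel 3, dilation 2**i, with padding
        # chosen so the sequence length is preserved at every layer.
        self.layers = nn.ModuleList([
            nn.Conv1d(
                input_dim if i == 0 else hidden_dim, hidden_dim,
                kernel_size=3, dilation=2 ** i, padding=2 ** i
            )
            for i in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)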
class DataIterator(object):
    """Data Iterator."""

    def _trim_vocab(self, vocab, vocab_size):
        # Discard start, end, pad and unk tokens if already present
        if '<s>' in vocab:
            del vocab['<s>']
        if '<pad>' in vocab:
            del vocab['<pad>']
        if '</s>' in vocab:
            del vocab['</s>']
        if '<unk>' in vocab:
            del vocab['<unk>']
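The rest of the method isn't shown in the preview. A typical continuation, assuming vocab maps each word to its frequency (a hedged sketch, not the original code), pins the special tokens to fixed indices and keeps the vocab_size most frequent words:

        word2id = {'<s>': 0, '<pad>': 1, '</s>': 2, '<unk>': 3}
        # Keep the vocab_size most frequent of the remaining words.
        sorted_words = sorted(vocab.items(), key=lambda x: -x[1])
        for word, _ in sorted_words[:vocab_size]:
            word2id[word] = len(word2id)
        id2word = {v: k for k, v in word2id.items()}
        return word2id, id2word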
import torch
import torch.nn as nn


class PeepholeGRU(nn.Module):
    """A Gated Recurrent Unit (GRU) cell with peepholes."""

    def __init__(
        self, input_dim, hidden_dim, n_layers,
        dropout=0., batch_first=True
    ):
        """Initialize params."""
        super(PeepholeGRU, self).__init__()
        self.input_dim = input_dim
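The preview stops here, so the peephole wiring itself isn't visible. For reference, the plain GRU step such a cell builds on looks like the following (standard equations, matching torch.nn.GRUCell; a peephole variant would add extra terms into the gates):

def gru_step(x, h, w_ih, w_hh, b_ih, b_hh):
    """One standard GRU step; weight shapes follow torch.nn.GRUCell."""
    gi = torch.mm(x, w_ih.t()) + b_ih  # input-to-hidden projections
    gh = torch.mm(h, w_hh.t()) + b_hh  # hidden-to-hidden projections
    i_r, i_z, i_n = gi.chunk(3, 1)
    h_r, h_z, h_n = gh.chunk(3, 1)
    r = torch.sigmoid(i_r + h_r)       # reset gate
    z = torch.sigmoid(i_z + h_z)       # update gate
    n = torch.tanh(i_n + r * h_n)      # candidate hidden state
    return (1 - z) * n + z * h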
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

x = Variable(torch.randn(10, 20, 30)).cuda()
# Lengths must be positive and sorted in decreasing order;
# range(10) would include an invalid length of 0.
lens = list(range(1, 11))[::-1]
x = pack_padded_sequence(x, lens, batch_first=True)
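pad_packed_sequence is imported above but never used; a round trip through an RNN and back to a padded tensor would look like this (the GRU here is just for illustration):

rnn = nn.GRU(30, 50, batch_first=True).cuda()
packed_out, h_n = rnn(x)
# Unpack back to a padded (batch, seq, feature) tensor plus lengths.
out, out_lens = pad_packed_sequence(packed_out, batch_first=True)
print(out.size())  # (10, 10, 50): the longest sequence has length 10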
import torch
import torch.nn as nn
from torch.autograd import Variable

lstm = nn.LSTM(512, 512, 3).cuda()
x_val = Variable(torch.randn(200, 128, 512)).cuda()   # (seq_len, batch, dim)
y_val = Variable(torch.randn(200, 128, 512)).cuda()
h0_val = Variable(torch.randn(3, 128, 512)).cuda()    # one state per layer
c0_val = Variable(torch.randn(3, 128, 512)).cuda()
for i in range(1000):
    output, (_, _) = lstm(x_val, (h0_val, c0_val))
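If the loop is meant as a throughput benchmark, note that CUDA kernels launch asynchronously, so wall-clock timing needs explicit synchronization. A sketch:

import time

n_iters = 1000
torch.cuda.synchronize()  # flush pending work before starting the clock
start = time.time()
for _ in range(n_iters):
    output, _ = lstm(x_val, (h0_val, c0_val))
torch.cuda.synchronize()  # wait for the last kernels to finish
print('%.2f ms / iteration' % ((time.time() - start) * 1000. / n_iters))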
def get_minibatch(lines, index, batch_size, word2ind, max_len, add_start=False, add_end=True):
    """Prepare minibatch."""
    if add_start and add_end:
        lines = [
            ['<s>'] + line + ['</s>']
            for line in lines[index:index + batch_size]
        ]
    elif add_start and not add_end:
        lines = [
            ['<s>'] + line
            for line in lines[index:index + batch_size]
        ]
    elif not add_start and add_end:
        lines = [
            line + ['</s>']
            for line in lines[index:index + batch_size]
        ]
    else:
        lines = [line for line in lines[index:index + batch_size]]
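The preview stops inside the branching. A typical remainder for this kind of helper (a hedged sketch; none of it appears in the original) truncates to max_len, maps words through word2ind with an <unk> fallback, and pads each line to the longest in the batch:

    lines = [line[:max_len] for line in lines]
    lens = [len(line) for line in lines]
    max_batch_len = max(lens)
    input_lines = [
        [word2ind.get(w, word2ind['<unk>']) for w in line]
        + [word2ind['<pad>']] * (max_batch_len - len(line))
        for line in lines
    ]
    return input_lines, lens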
import subprocess
import sys

tokenizer_path = sys.argv[1]  # Path to the moses tokenizer mosesdecoder/scripts/tokenizer/tokenizer.perl
text = sys.argv[2]            # Text to be tokenized
lang = sys.argv[3]            # Input language ex: en, fr, de

# The tokenizer reads from stdin, so the text is piped in rather than
# passed as an extra command-line argument.
pipe = subprocess.Popen(
    ["perl", tokenizer_path, '-l', lang],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE
)
pipe.stdin.write(text.encode('utf-8'))
pipe.stdin.close()
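The preview ends before the tokenizer's output is consumed; reading it back would look like:

tokenized = pipe.stdout.read().decode('utf-8').strip()
print(tokenized)

Invoked as, for example (the script filename here is hypothetical):

# python moses_tokenize.py mosesdecoder/scripts/tokenizer/tokenizer.perl "Hello, world!" en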