import torch
import torch.nn as nn


class DilatedConvSentenceEncoder(nn.Module):
    """A Sentence Encoder with Dilated Convs."""

    def __init__(
        self, input_dim=512, hidden_dim=4096, n_layers=7,
        dropout=0.5, batch_first=True
    ):
        """Initialize params."""
        super(DilatedConvSentenceEncoder, self).__init__()
        self.input_dim = input_dim
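The gist preview cuts off after the first attribute. A minimal sketch of how the constructor might continue, assuming the remaining hyperparameters are stored and the encoder is a stack of nn.Conv1d layers whose dilation doubles per layer (the layer structure is an assumption, not visible in the original):

        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.batch_first = batch_first
        # Hypothetical conv stack: kernel 3, dilation 2**i, with padding
        # chosen so the sequence length is preserved at every layer.
        self.layers = nn.ModuleList([
            nn.Conv1d(
                input_dim if i == 0 else hidden_dim, hidden_dim,
                kernel_size=3, dilation=2 ** i, padding=2 ** i
            )
            for i in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)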
class DataIterator(object):
    """Data Iterator."""

    def _trim_vocab(self, vocab, vocab_size):
        # Discard start, end, pad and unk tokens if already present
        if '<s>' in vocab:
            del vocab['<s>']
        if '<pad>' in vocab:
            del vocab['<pad>']
        if '</s>' in vocab:
            del vocab['</s>']
        if '<unk>' in vocab:
            del vocab['<unk>']
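The rest of the method isn't shown in the preview. A typical continuation, assuming vocab maps each word to its frequency (a hedged sketch, not the original code), pins the special tokens to fixed indices and keeps the vocab_size most frequent words:

        word2id = {'<s>': 0, '<pad>': 1, '</s>': 2, '<unk>': 3}
        # Keep the vocab_size most frequent of the remaining words.
        sorted_words = sorted(vocab.items(), key=lambda x: -x[1])
        for word, _ in sorted_words[:vocab_size]:
            word2id[word] = len(word2id)
        id2word = {v: k for k, v in word2id.items()}
        return word2id, id2word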
import torch
import torch.nn as nn


class PeepholeGRU(nn.Module):
    """A Gated Recurrent Unit (GRU) cell with peepholes."""

    def __init__(
        self, input_dim, hidden_dim, n_layers,
        dropout=0., batch_first=True
    ):
        """Initialize params."""
        super(PeepholeGRU, self).__init__()
        self.input_dim = input_dim
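The preview stops here, so the peephole wiring itself isn't visible. For reference, the plain GRU step such a cell builds on looks like the following (standard equations, matching torch.nn.GRUCell; a peephole variant would add extra terms into the gates):

def gru_step(x, h, w_ih, w_hh, b_ih, b_hh):
    """One standard GRU step; weight shapes follow torch.nn.GRUCell."""
    gi = torch.mm(x, w_ih.t()) + b_ih  # input-to-hidden projections
    gh = torch.mm(h, w_hh.t()) + b_hh  # hidden-to-hidden projections
    i_r, i_z, i_n = gi.chunk(3, 1)
    h_r, h_z, h_n = gh.chunk(3, 1)
    r = torch.sigmoid(i_r + h_r)       # reset gate
    z = torch.sigmoid(i_z + h_z)       # update gate
    n = torch.tanh(i_n + r * h_n)      # candidate hidden state
    return (1 - z) * n + z * h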
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

x = Variable(torch.randn(10, 20, 30)).cuda()
# Lengths must be positive and sorted in decreasing order;
# range(10) would include an invalid length of 0.
lens = list(range(1, 11))[::-1]
x = pack_padded_sequence(x, lens, batch_first=True)
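pad_packed_sequence is imported above but never used; a round trip through an RNN and back to a padded tensor would look like this (the GRU here is just for illustration):

rnn = nn.GRU(30, 50, batch_first=True).cuda()
packed_out, h_n = rnn(x)
# Unpack back to a padded (batch, seq, feature) tensor plus lengths.
out, out_lens = pad_packed_sequence(packed_out, batch_first=True)
print(out.size())  # (10, 10, 50): the longest sequence has length 10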
import torch
import torch.nn as nn
from torch.autograd import Variable

lstm = nn.LSTM(512, 512, 3).cuda()
x_val = Variable(torch.randn(200, 128, 512)).cuda()   # (seq_len, batch, dim)
y_val = Variable(torch.randn(200, 128, 512)).cuda()
h0_val = Variable(torch.randn(3, 128, 512)).cuda()    # one state per layer
c0_val = Variable(torch.randn(3, 128, 512)).cuda()
for i in range(1000):
    output, (_, _) = lstm(x_val, (h0_val, c0_val))
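If the loop is meant as a throughput benchmark, note that CUDA kernels launch asynchronously, so wall-clock timing needs explicit synchronization. A sketch:

import time

n_iters = 1000
torch.cuda.synchronize()  # flush pending work before starting the clock
start = time.time()
for _ in range(n_iters):
    output, _ = lstm(x_val, (h0_val, c0_val))
torch.cuda.synchronize()  # wait for the last kernels to finish
print('%.2f ms / iteration' % ((time.time() - start) * 1000. / n_iters))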
def get_minibatch(lines, index, batch_size, word2ind, max_len, add_start=False, add_end=True):
    """Prepare minibatch."""
    if add_start and add_end:
        lines = [
            ['<s>'] + line + ['</s>']
            for line in lines[index:index + batch_size]
        ]
    elif add_start and not add_end:
        lines = [
            ['<s>'] + line
            for line in lines[index:index + batch_size]
        ]
    elif not add_start and add_end:
        lines = [
            line + ['</s>']
            for line in lines[index:index + batch_size]
        ]
    else:
        lines = [line for line in lines[index:index + batch_size]]
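The preview stops inside the branching. A typical remainder for this kind of helper (a hedged sketch; none of it appears in the original) truncates to max_len, maps words through word2ind with an <unk> fallback, and pads each line to the longest in the batch:

    lines = [line[:max_len] for line in lines]
    lens = [len(line) for line in lines]
    max_batch_len = max(lens)
    input_lines = [
        [word2ind.get(w, word2ind['<unk>']) for w in line]
        + [word2ind['<pad>']] * (max_batch_len - len(line))
        for line in lines
    ]
    return input_lines, lens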
import subprocess
import sys

tokenizer_path = sys.argv[1]  # Path to the moses tokenizer mosesdecoder/scripts/tokenizer/tokenizer.perl
text = sys.argv[2]            # Text to be tokenized
lang = sys.argv[3]            # Input language ex: en, fr, de

# The tokenizer reads from stdin, so the text is piped in rather than
# passed as an extra command-line argument.
pipe = subprocess.Popen(
    ["perl", tokenizer_path, '-l', lang],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE
)
pipe.stdin.write(text.encode('utf-8'))
pipe.stdin.close()
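The preview ends before the tokenizer's output is consumed; reading it back would look like:

tokenized = pipe.stdout.read().decode('utf-8').strip()
print(tokenized)

Invoked as, for example (the script filename here is hypothetical):

# python moses_tokenize.py mosesdecoder/scripts/tokenizer/tokenizer.perl "Hello, world!" en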