Skip to content

Instantly share code, notes, and snippets.

@merckxiaan

merckxiaan/fra.txt

Last active Dec 20, 2018
Embed
What would you like to do?
Stay still. Ne bouge pas.
Step aside. Écarte-toi !
Step aside. Écartez-vous !
Stop lying. Arrête de mentir.
Stop lying. Arrêtez de mentir.
Study hard. Étudie avec application.
Study hard. Étudiez avec application.
Take a bus. Prenez un bus.
Take a nap. Fais un petit somme !
Take cover! Mets-toi à l'abri !
Take cover! Mettez-vous à l'abri !
Take notes. Prends des notes.
Take notes. Prenez des notes.
Talk to me! Parle-moi !
Talk to me! Parle-moi !
Talk to me! Parlez-moi !
Talk to me. Parle avec moi.
Talk to me. Parlez avec moi.
Taste this. Essayez ceci !
Taste this. Goûte ça.
Taste this. Goûtez ceci.
That a boy! C'est bien !
That a boy! T'es un bon garçon !
That hurts. Ça fait mal.
That works. Ça fonctionne.
That's all. C'est tout.
That's fun. C'est amusant.
That's fun. C'est marrant.
That's her. C'est elle.
That's his. Ce sont les siens.
That's his. C'est le sien.
That's his. C'est la sienne.
That's odd. C'est bizarre.
They agree. Ils sont d'accord.
They agree. Elles sont d'accord.
They cheat. Ils trichent.
They cheat. Elles trichent.
They voted. Ils ont voté.
They voted. Elles ont voté.
This is it. Ça y est.
This works. Ça fonctionne.
Time flies. Le temps s'enfuit.
Time flies. Le temps s'envole.
Time is up. Le temps est écoulé.
Time is up. L'heure est passée.
Tom agrees. Tom est d'accord.
Tom cheats. Tom triche.
Tom drinks. Tom boit.
Tom drives. Tom conduit.
Tom forgot. Tom a oublié.
# fra.txt contains only a very small subset of the data,
# link to download all data: "http://www.manythings.org/anki/fra-eng.zip"
# Script-level hyperparameters. Declared `const` so the globals are
# concretely typed (non-const globals are Any-typed and slow in Julia).
const FILE = "./fra.txt"        # tab-separated "english<TAB>french" sentence pairs
const MAX_LENGTH = 10           # max tokens per sentence kept by filterPair
const HIDDEN = 64               # GRU hidden / embedding size
const BATCH_SIZE = 16
const LEARNING_RATE = 0.001
const DROPOUT = 0.2             # default dropout rate for Encoder / training
using CuArrays, Flux, Statistics, Random
# Vocabulary bookkeeping for one language.
# Indices 1..4 are reserved: 1=SOS, 2=EOS, 3=UNK, 4=PAD.
# Fields are concretely typed (was all-Any) so dictionary access and
# `n_words` arithmetic are type-stable.
mutable struct Lang
    name::String
    word2index::Dict{String, Int}   # word -> id (ids start after the 4 reserved slots)
    word2count::Dict{String, Int}   # word -> occurrence count
    index2word::Dict{Int, String}   # id -> word, pre-seeded with the reserved tokens
    n_words::Int                    # highest id assigned so far
end

# Construct an empty vocabulary with the four reserved tokens registered.
Lang(name) = Lang(
    name,
    Dict{String, Int}(),
    Dict{String, Int}(),
    Dict{Int, String}(1=>"SOS", 2=>"EOS", 3=>"UNK", 4=>"PAD"),
    4)
# Add every space-separated word of `sentence` to the vocabulary.
# New words get the next free id; known words just bump their count.
# BUG FIX: the scraped source read `if word keys(l.word2index)` — the `∉`
# operator was lost. Restored as the idiomatic `!haskey`.
function (l::Lang)(sentence::String)
    for word in split(sentence, " ")
        if !haskey(l.word2index, word)
            l.word2index[word] = l.n_words + 1
            l.word2count[word] = 1
            l.index2word[l.n_words + 1] = word
            l.n_words += 1
        else
            l.word2count[word] += 1
        end
    end
end
# Lowercase and trim `s`, then pad punctuation and apostrophes with spaces
# so that a later split(s, " ") yields clean tokens (e.g. "don't" -> "don ' t").
function normalizeString(s)
    t = lowercase(strip(s))
    t = replace(t, r"([.!?,])" => s" \1")
    return replace(t, "'" => " ' ")
end
# Read FILE (one tab-separated sentence pair per line), normalize each side,
# and build empty Lang vocabularies for both languages.
# With rev=true the pairs are flipped and the Lang roles swap accordingly.
# Returns (input_lang, output_lang, pairs).
function readLangs(lang1, lang2; rev=false)
    println("Reading lines...")
    raw = readlines(FILE)
    pairs = [normalizeString.(cols) for cols in split.(raw, "\t")]
    if rev
        pairs = reverse.(pairs)
        input_lang, output_lang = Lang(lang2), Lang(lang1)
    else
        input_lang, output_lang = Lang(lang1), Lang(lang2)
    end
    return (input_lang, output_lang, pairs)
end
# English sentence prefixes kept by filterPair. Token spacing matches what
# normalizeString emits (apostrophes padded: "i'm" -> "i ' m").
# `const` so the global is concretely typed.
const eng_prefixes = [
    "i am ", "i ' m ",
    "he is ", "he ' s ",
    "she is ", "she ' s ",
    "you are ", "you ' re ",
    "we are ", "we ' re ",
    "they are ", "they ' re "]
# Keep a pair only when BOTH sentences have at most MAX_LENGTH tokens AND the
# English side (p[1]) starts with one of the tracked prefixes.
# BUG FIX: the scraped source lost the `∉`/`∈` operators
# (`false (...)` / `true (...)`); restored as `all` / `any`.
function filterPair(p)
    short_enough = all(length(split(s, " ")) <= MAX_LENGTH for s in p)
    has_prefix = any(startswith(p[1], pre) for pre in eng_prefixes)
    return short_enough && has_prefix
end
# Full data pipeline: read and normalize pairs, drop pairs that fail
# filterPair, split into parallel sentence lists, and count words into the
# two vocabularies.
# Note the crossed counting: input_lang (lang1) counts pair[2], output_lang
# (lang2) counts pair[1] — matching how the script later pairs xs with the
# `eng` vocab and ys with `fr`.
# Returns (input_lang, output_lang, xs, ys).
function prepareData(lang1, lang2; rev=false)
    input_lang, output_lang, pairs = readLangs(lang1, lang2; rev=rev)
    println("Read $(length(pairs)) sentence pairs.")
    pairs = filter(filterPair, pairs)
    println("Trimmed to $(length(pairs)) sentence pairs.\n")
    xs = Any[p[1] for p in pairs]
    ys = Any[p[2] for p in pairs]
    println("Counting words...")
    for p in pairs
        input_lang(p[2])
        output_lang(p[1])
    end
    println("Counted words:")
    println("", input_lang.name, ": ", input_lang.n_words)
    println("", output_lang.name, ": ", output_lang.n_words)
    return (input_lang, output_lang, xs, ys)
end
# Build vocabularies and the filtered sentence lists.
# NOTE(review): `fr` counts the French (second) column, `eng` the English
# (first) column; xs are English sentences, ys French.
fr, eng, xs, ys = prepareData("fr", "eng");
# Map a sentence to token ids: lowercase, split on spaces, look each word up
# in lang.word2index (UNK = 3 for unseen words), and append EOS (= 2).
function indexesFromSentence(lang, sentence)
    words = split(lowercase(sentence), " ")
    ids = [get(lang.word2index, w, 3) for w in words]
    push!(ids, 2)
    return ids
end
# Shuffle both corpora with ONE shared permutation so (xs[i], ys[i])
# stay aligned as translation pairs.
indices = shuffle([1:length(xs)...])
xs = xs[indices]
ys = ys[indices]
"""
    batch(data, batch_size, voc_size; gpu=true)

Group `data` (a vector of integer token-id sequences) into mini-batches of
`batch_size`. Each batch is padded with PAD (= 4) up to its longest sentence
and converted to one-hot matrices — one `(voc_size × batch_size)` matrix per
timestep. With `gpu=true` each matrix is moved to the GPU via `cu`.
"""
function batch(data, batch_size, voc_size; gpu=true)
    out = []
    for chunk in Iterators.partition(data, batch_size)
        max_length = maximum(length.(chunk))
        # BUG FIX: the original used append!, mutating the caller's token
        # sequences in place; vcat builds padded copies instead.
        padded = [vcat(s, fill(4, max_length - length(s))) for s in chunk]
        mat = hcat(reshape.(padded, :, 1)...)
        batch_out = []
        # One one-hot matrix per timestep (row of `mat`).
        for i in 1:size(mat, 1)
            onehots = Flux.onehotbatch(mat[i, :], [1:voc_size...])
            push!(batch_out, gpu ? cu(onehots) : onehots)
        end
        push!(out, batch_out)
    end
    return(out)
end
# Tokenize, batch and one-hot both corpora on the GPU:
# x uses the English vocab (encoder input), y the French vocab (targets).
x, y = batch.([indexesFromSentence.([eng], xs), indexesFromSentence.([fr], ys)], [BATCH_SIZE], [eng.n_words, fr.n_words]; gpu=true)
# Bidirectional GRU encoder over a sequence of one-hot word matrices.
struct Encoder
    embedding   # h_size × voc_size trainable embedding matrix
    dropout     # Dropout layer (NOTE(review): not used by the functor — confirm intent)
    forward     # left-to-right GRU
    backward    # right-to-left GRU
    output      # Dense projecting concatenated fwd/bwd states back to h_size
end
# Constructor: hidden size and dropout rate default to the script constants.
Encoder(voc_size::Integer; h_size::Integer=HIDDEN, dropout::Number=DROPOUT) = Encoder(
    param(Flux.glorot_uniform(h_size, voc_size)),
    Dropout(dropout),
    GRU(h_size, h_size),
    GRU(h_size, h_size),
    Dense(2*h_size, h_size))
# Encode a sequence: x is a vector of one-hot (voc_size × batch) matrices.
# Each timestep is embedded, run through both GRUs (backward direction via
# Flux.flip), concatenated, and projected back down to h_size.
# FIX: removed the unreachable `return(x)` after the first return.
function (e::Encoder)(x)
    embedded = map(step -> e.embedding * step, x)
    return e.output.(vcat.(e.forward.(embedded), Flux.flip(e.backward, embedded)))
end
Flux.@treelike Encoder
# Attention decoder: embeds the previous output token, attends over the
# encoder outputs, and emits a distribution over the output vocabulary.
struct Decoder
    embedding   # h_size × voc_size trainable embedding matrix
    attention   # Attention module scoring encoder outputs vs. decoder state
    rnn         # GRU over the concatenation [embedding; attention context]
    output      # Dense projecting to vocabulary scores
end
# NOTE(review): relu here combined with the functor's softmax zeroes all
# negative logits — unusual choice; confirm it is intentional.
Decoder(h_size, voc_size) = Decoder(
    param(Flux.glorot_uniform(h_size, voc_size)),
    Attention(h_size),
    GRU(h_size*2, h_size),
    Dense(h_size, voc_size, relu))
# Run one decoder timestep.
#   x: one-hot input token(s); encoder_outputs: per-timestep encoder vectors.
# Reads d.rnn.state BEFORE stepping the GRU, so attention is computed against
# the PREVIOUS decoder state — this ordering is load-bearing.
function (d::Decoder)(x, encoder_outputs; dropout=0)
    x = d.embedding * x
    # A fresh Dropout layer is built per call (rate from the kwarg);
    # NOTE(review): whether it is active depends on Flux's train/test mode.
    x = Dropout(dropout)(x)
    decoder_state = d.rnn.state
    # Attention context over all encoder outputs, conditioned on prior state.
    context = d.attention(encoder_outputs, decoder_state)
    x = d.rnn([x; context])
    # Distribution over the output vocabulary.
    x = softmax(d.output(x))
    return(x)
end
Flux.@treelike Decoder
# Attention scorer: a single Dense layer that scores one concatenated
# [encoder_output; decoder_state] pair down to a scalar per batch column.
struct Attention
    linear  # Dense(2*h_size -> 1, tanh)
end
Attention(h_size::Int) = Attention(Dense(2*h_size, 1, tanh))
# Compute an attention-weighted sum of encoder outputs.
#   encoder_outputs: per-timestep (h_size × batch) matrices.
#   decoder_state:   previous decoder hidden state (h_size × batch).
# Returns one (h_size × batch) context matrix.
# FIX: removed the unused local `results = []` from the original.
function (a::Attention)(encoder_outputs, decoder_state)
    # Score each encoder timestep against the current decoder state.
    scores = [a.linear([word; decoder_state]) for word in encoder_outputs]
    # softmax normalises over the sequence dimension (rows after vcat).
    weights = softmax(vcat(scores...))
    # Weighted sum over timesteps; weights[i, :]' broadcasts across the batch.
    return sum(encoder_outputs[i] .* weights[i, :]' for i in 1:size(weights, 1))
end
Flux.@treelike Attention
# Teacher-forced training step over one batch.
#   x: sequence of one-hot input matrices; y: sequence of one-hot targets.
# Returns the summed per-timestep loss (a tracked scalar for backprop).
function model(encoder::Encoder, decoder::Decoder, x, y; teacher_forcing = 0.5, dropout=DROPOUT, voc_size=fr.n_words)
    total_loss = 0
    max_length = length(y)
    batch_size = size(x[1], 2)
    # Clear recurrent state left over from the previous batch.
    Flux.reset!.([encoder, decoder])
    encoder_outputs = encoder(x)
    # First decoder input: SOS token (index 1) for every batch element.
    # NOTE(review): ones(batch_size) is Float64; onehotbatch matches 1.0
    # against the Int labels — confirm this is safe in the Flux version used.
    decoder_input = Flux.onehotbatch(ones(batch_size), [1:voc_size...])
    # Seed the decoder state with the final encoder output.
    decoder.rnn.state = encoder_outputs[end]
    for i in 1:max_length
        # Per-timestep coin flip: feed ground truth or the model's own guess.
        use_teacher_forcing = rand() < teacher_forcing
        decoder_output = decoder(decoder_input, encoder_outputs; dropout=dropout)
        total_loss += loss(decoder_output, y[i])
        if use_teacher_forcing
            decoder_input = y[i]
        else
            # .data detaches from Tracker so the argmax isn't differentiated.
            decoder_input = Flux.onehotbatch(Flux.onecold(decoder_output.data), [1:voc_size...])
        end
    end
    return(total_loss)
end
# Two-argument convenience method used by Flux.train! below; closes over the
# global testEncoder/testDecoder and uses a lighter dropout than training default.
model(x, y) = model(testEncoder, testDecoder, x, y; dropout = 0.05)
# Greedy inference: encode x, then decode up to 12 tokens, stopping at EOS (= 2).
# Returns the predicted token ids (EOS excluded).
# BUG FIX: the original never updated decoder_input inside the loop, so the
# decoder was fed SOS at every step; we now feed back the previous prediction.
# Also removed the leftover `@show decoder_output` debug print.
function model(encoder::Encoder, decoder::Decoder, x; reset=true, voc_size=fr.n_words)
    result = []
    if reset
        Flux.reset!.([encoder, decoder])
    end
    encoder_outputs = encoder(x)
    decoder_input = Flux.onehot(1, [1:voc_size...])   # 1 = SOS
    decoder.rnn.state = encoder_outputs[end]
    for i in 1:12
        decoder_output = Flux.onecold(decoder(decoder_input, encoder_outputs))
        if decoder_output[1] == 2 break end           # 2 = EOS
        push!(result, decoder_output...)
        # Feed the prediction back as the next decoder input.
        decoder_input = Flux.onehot(decoder_output[1], [1:voc_size...])
    end
    return(result)
end
# Mean negative log-likelihood over one timestep's batch, skipping PAD targets.
#   x: predicted probability matrix (voc_size × batch); y: one-hot targets.
# NOTE(review): `y[:, i].ix` reads the hot-index field of a Tracker-era Flux
# OneHotVector — confirm this field exists in the Flux version in use.
# NOTE(review): if every column is PAD, `mean` of an empty vector is NaN.
loss = function(x, y)
    losses = []
    for i in 1:size(x, 2)
        index = y[:, i].ix
        # 4 = PAD: padded positions contribute no loss.
        if index != 4
            push!(losses, -log(x[index, i]))
        end
    end
    return(mean(losses))
end
# Instantiate the encoder/decoder on the GPU and a plain SGD optimiser.
# (Tracker-era Flux API: SGD takes the params collection and a learning rate.)
testEncoder = Encoder(eng.n_words)|>gpu
testDecoder = Decoder(HIDDEN, fr.n_words)|>gpu
Flux.reset!.([testEncoder, testDecoder])
opt = SGD(params(testEncoder, testDecoder), LEARNING_RATE)
# Randomly split parallel collections x and y into train/test portions,
# keeping pairs aligned via one shared permutation. `at` is the train fraction.
# Returns (train_x, train_y, test_x, test_y).
function partitionTrainTest(x, y, at = 0.7)
    n = length(x)
    perm = shuffle(1:n)
    cut = floor(Int, at * n)
    tr = view(perm, 1:cut)
    te = view(perm, (cut + 1):n)
    return (x[tr, :], y[tr, :], x[te, :], y[te, :])
end
# 90/10 train/test split of the pre-batched data.
train_x, train_y, test_x, test_y = partitionTrainTest(x, y, 0.90)
# Five passes over the training pairs; report mean test loss after each.
for i in 1:5
    Flux.train!(model, zip(train_x, train_y), opt)
    # .data unwraps the tracked scalar (Tracker-era Flux).
    println("loss: ", mean(model.(test_x, test_y)).data)
end
# Translate a single English sentence: normalize, tokenize against the `eng`
# vocab, one-hot encode, run greedy inference, and map ids back to French words.
# FIX: removed the leftover `@show input` debug print and reuse the shared
# indexesFromSentence helper instead of duplicating its lookup inline.
function predict(encoder, decoder, sentence::String)
    sentence = normalizeString(sentence)
    input = indexesFromSentence(eng, sentence)        # UNK = 3, EOS = 2 appended
    input = [Flux.onehot(word, [1:eng.n_words...]) for word in input]
    output = model(encoder, decoder, input)
    return get.(Ref(fr.index2word), output, "UNK")
end
# Smoke-test: translate one sentence with the freshly trained model.
predict(testEncoder, testDecoder, "he is skinny.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment