Last active
May 14, 2017 02:12
-
-
Save keppel/19e2a095cbc08d26ee418c0b7bc16c23 to your computer and use it in GitHub Desktop.
evolving a character-level GRU in vanilla ndarray
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
evolving a character-level RNN (GRU) | |
in vanilla ndarray | |
*/ | |
let ndarray = require('ndarray') | |
let ops = require('ndarray-ops') | |
let nj = require('numjs') // just discovered this. todo: rewrite gru to use nj methods | |
let gaussian = require('gaussian') | |
let distribution = gaussian(0, 1) | |
let { gemv } = require('ndarray-blas-level2') | |
function unpack(arr, num) { | |
if (num > arr.length) { | |
throw new Error('tried to get too many parameters') | |
} | |
return arr.splice(arr.length - num, num) | |
} | |
function jitter(p, sigma) { | |
let noise = Array(p.length).fill(0).map(() => distribution.ppf(Math.random())) | |
let params = p.map((v, k) => v + sigma * noise[k]) | |
return { noise, params } | |
} | |
function buildModel(initialParams, hiddenSize, dictSize, gruParamSize) { | |
let params = initialParams.slice() | |
let rnn = gru(unpack(params, gruParamSize), hiddenSize, dictSize) | |
let w1 = ndarray(unpack(params, hiddenSize * dictSize), [ | |
dictSize, | |
hiddenSize | |
]) | |
return { rnn, w1 } | |
} | |
function forward(m, x, h0) { | |
let h1 = m.rnn(x, h0) | |
let y = softmax(ff(m.w1, h1)) | |
return [y, h1] | |
} | |
function ff(w, x, b) { | |
let y = ndarray(Array(w.shape[0]).fill(0)) | |
if (b) { | |
ops.assign(y, b) | |
} | |
gemv(1, w, x, 1, y) | |
return y | |
} | |
function gruInit(hiddenSize, xSize) { | |
return Array(6 * hiddenSize) | |
.fill(0) | |
.concat( | |
Array(3 * xSize * hiddenSize + 3 * hiddenSize * hiddenSize) | |
.fill(0) | |
.map( | |
() => distribution.ppf(Math.random()) / Math.sqrt(xSize * hiddenSize) | |
) | |
) | |
} | |
function gru(params, hiddenSize, xSize) { | |
// params required: 3(x * h) + 3(h * h) + 6(h) | |
let paramsRequired = | |
3 * xSize * hiddenSize + 3 * hiddenSize * hiddenSize + 6 * hiddenSize | |
let bir = ndarray(unpack(params, hiddenSize)) | |
let bin = ndarray(unpack(params, hiddenSize)) | |
let bii = ndarray(unpack(params, hiddenSize)) | |
let bhn = ndarray(unpack(params, hiddenSize)) | |
let bhr = ndarray(unpack(params, hiddenSize)) | |
let bhi = ndarray(unpack(params, hiddenSize)) | |
let Wir = ndarray(unpack(params, xSize * hiddenSize), [hiddenSize, xSize]) | |
let Whr = ndarray(unpack(params, hiddenSize * hiddenSize), [ | |
hiddenSize, | |
hiddenSize | |
]) | |
let Wii = ndarray(unpack(params, xSize * hiddenSize), [hiddenSize, xSize]) | |
let Whi = ndarray(unpack(params, hiddenSize * hiddenSize), [ | |
hiddenSize, | |
hiddenSize | |
]) | |
let Win = ndarray(unpack(params, xSize * hiddenSize), [hiddenSize, xSize]) | |
let Whn = ndarray(unpack(params, hiddenSize * hiddenSize), [ | |
hiddenSize, | |
hiddenSize | |
]) | |
return (x, h0) => { | |
// rt=sigmoid(Wirxt+bir+Whrh(t−1)+bhr) | |
let rt = ndarray(Array(hiddenSize).fill(0)) | |
ops.add(rt, ff(Wir, x), ff(Whr, h0), bir, bhr) | |
rt = sigmoid(rt) | |
// it=sigmoid(Wiixt+bii+Whih(t−1)+bhi) | |
let it = ndarray(Array(hiddenSize).fill(0)) | |
ops.add(it, ff(Wii, x), ff(Whi, h0), bii, bhi) | |
it = sigmoid(it) | |
// nt=tanh(Winxt+bin+rt∗(Whnh(t−1)+bhn)) | |
let nt = ndarray(Array(hiddenSize).fill(0)) | |
ops.mul(rt, rt, ff(Whn, h0, bhn)) | |
ops.add(nt, ff(Win, x), rt, bin) | |
nt = tanh(nt) | |
// ht=(1−it)∗nt+it∗h(t−1) | |
let h1 = ndarray(Array(hiddenSize).fill(0)) | |
h1.data.forEach((v, k) => { | |
h1.data[k] += (1 - it.data[k]) * nt.data[k] + it.data[k] * h0.data[k] | |
}) | |
return h1 | |
} | |
} | |
function tanh(ndarr) { | |
return ndarray(ndarr.data.slice().map(v => Math.tanh(v)), ndarr.shape) | |
} | |
function sigmoid(ndarr) { | |
return ndarray( | |
ndarr.data.slice().map(v => 1 / (1 + Math.exp(-v))), | |
ndarr.shape | |
) | |
} | |
const EOS = '∆' | |
function buildDictionaries(corpus) { | |
let charactersSeen = new Set() | |
let charToIndex = { [EOS]: 0 } | |
let indexToChar = { 0: EOS } | |
charactersSeen.add(EOS) | |
for (let i = 0; i < corpus.length; i++) { | |
if (!charactersSeen.has(corpus[i])) { | |
charToIndex[corpus[i]] = charactersSeen.size | |
indexToChar[charactersSeen.size] = corpus[i] | |
} | |
charactersSeen.add(corpus[i]) | |
} | |
return { charToIndex, indexToChar, size: charactersSeen.size } | |
} | |
function onehot(size, index) { | |
let embedded = ndarray(Array(size).fill(0)) | |
embedded.set(index, 1) | |
return embedded | |
} | |
function softmax(nd) { | |
let expSum = 0 | |
nd.data.forEach(n => { | |
expSum += Math.exp(n) | |
}) | |
let softmaxedValues = nd.data.map(n => Math.exp(n) / expSum) | |
return ndarray(softmaxedValues) | |
} | |
// let data = | |
// 'how does a bastard, orphan, son of a whore and a scotsman, dropped in the middle of a forgotten spot' | |
let data = 'generate me' | |
// let data = 'yolo' | |
// let data = '1234' | |
let dicts = buildDictionaries(data) | |
let hiddenSize = 10 | |
let gruInitialParams = gruInit(hiddenSize, dicts.size) | |
let initialParams = Array(hiddenSize * dicts.size) | |
.fill(0) | |
.map(() => distribution.ppf(Math.random())) | |
.concat(gruInitialParams) | |
function fitness(params) { | |
let model = buildModel( | |
params, | |
hiddenSize, | |
dicts.size, | |
gruInitialParams.length | |
) | |
let h = ndarray(Array(hiddenSize).fill(0)) | |
let score = 0 | |
let text = data + EOS | |
for (let i = 0; i < text.length - 1; i++) { | |
// off-by-one judd, fix this | |
let index = dicts.charToIndex[text[i]] | |
let indexNext = dicts.charToIndex[text[i + 1]] | |
let x = onehot(dicts.size, index) | |
let out = forward(model, x, h) | |
y = out[0] | |
h = out[1] | |
score += Math.log(y.get(indexNext)) | |
} | |
return score / text.length | |
} | |
function adam(beta1 = 0.9, beta2 = 0.999, eps = 1e-8) { | |
let m = [] | |
let v = [] | |
return (gradient, key) => { | |
m[key] = beta1 * (m[key] || 0) + (1 - beta1) * gradient | |
v[key] = beta2 * (v[key] || 0) + (1 - beta2) * Math.pow(gradient, 2) | |
return m[key] / (Math.sqrt(v[key]) + eps) | |
} | |
} | |
let headParams = initialParams.slice() | |
let normalizationWindow = 50 | |
let sigma = 0.01 | |
let alpha = 0.001 | |
let mean = 0 | |
let sd = 1e-8 | |
let optimizer = adam() | |
for (let i = 0; i < 50000; i++) { | |
let { params, noise } = jitter(headParams, sigma) | |
let score = fitness(params) | |
const n = 1 / normalizationWindow | |
mean = (1 - n) * mean + n * score | |
sd = (1 - n) * sd + n * Math.pow(score - mean, 2) | |
let advantage = (score - mean) / (sd + 1e-8) | |
if (i > normalizationWindow) { | |
headParams.forEach((v, k) => { | |
headParams[k] += | |
alpha * | |
optimizer(advantage * noise[k], k) / | |
(sigma * normalizationWindow) | |
// weight decay | |
// headParams[k] *= 1 - 1e-5 | |
}) | |
} | |
if (!(i % 100)) { | |
console.log(mean) | |
} | |
} | |
function sample(params, seed) { | |
let model = buildModel( | |
params, | |
hiddenSize, | |
dicts.size, | |
gruInitialParams.length | |
) | |
let maxLength = 120 | |
let h = ndarray(Array(hiddenSize).fill(0)) | |
let generated = seed | |
while ( | |
generated[generated.length - 1] !== EOS && generated.length < maxLength | |
) { | |
let index = dicts.charToIndex[generated[generated.length - 1]] | |
let x = onehot(dicts.size, index) | |
let out = forward(model, x, h) | |
y = out[0] | |
h = out[1] | |
generated += dicts.indexToChar[ops.argmax(y)] | |
} | |
return generated | |
} | |
console.log(sample(headParams, 'g')) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment