From the paper Transformer decoding in fifty lines of pseudocode by Bob Carpenter at the Flatiron Institute. I copied it by hand and it almost certainly has a mistake or two in it.
DECODE(tok: int<lower=1, upper=T>[N],
       alpha: matrix(T, V),
       betas: { query: matrix(V, K),
                key: matrix(V, K),
                value: matrix(V, V) }[A],
       gammas: { 1: vector(L), 2: matrix(L, V),
                 3: vector(V), 4: matrix(V, L) }[A],
       delta: matrix(T, V)): simplex(T)
  for n in 1:N:                                        // embed input
    xs[n, 1:V] = alpha[tok[n], 1:V] + POS(n)           // embedding of token n
  for a in 1:A:                                        // A attention layers
    xs = ATTEND(xs, betas[a])                          // update tokens jointly
    for n in 1:N:                                      // update tokens individually
      xs[n, 1:V] = FEED_FORWARD(xs[n, 1:V], gammas[a]) // with shared NN
  return LOGISTIC_REGRESSION(xs[N, 1:V], delta)        // next token probs
ATTEND(x: matrix(N, V),
       beta.query: matrix(V, K),
       beta.key: matrix(V, K),
       beta.value: matrix(V, V)): matrix(N, V)
  for n in 1:N:
    q[n, 1:K] = x[n, 1:V] * beta.query                 // optionally add intercept
    k[n, 1:K] = x[n, 1:V] * beta.key
    v[n, 1:V] = x[n, 1:V] * beta.value
  for n in 1:N:
    lp[1:n] = [q[n] * k[1]', ..., q[n] * k[n]']        // dot product log probs
              / sqrt(V)                                // scaled by sqrt value size
    lp[n+1:N] = -inf                                   // mask future tokens; token n sees 1:n
    p[1:N] = SOFTMAX(lp[1:N])                          // attention probs
    u[n, 1:V] = SUM(n' in 1:N) p[n'] * v[n', 1:V]      // weighted avg of token values
    y[n, 1:V] = STANDARDIZE(u[n, 1:V] + x[n, 1:V])     // add in to out (non-center)
  return y
POS(n: int<low=1, up=N>): vector(V)
  for i in 1:(V / 2):
    r = n / N**(2 * i / V)                             // exponent ranges from 2/V to 1
    u[2 * i - 1] = sin(r)
    u[2 * i] = cos(r)
  return u
FEED_FORWARD(x: vector(V),
             gamma.1: vector(L), gamma.2: matrix(L, V),
             gamma.3: vector(V), gamma.4: matrix(V, L)): vector(V)
  u[1:L] = gamma.1 + gamma.2 * x                       // first layer
  w[1:L] = GELU(u)                                     // non-linearity
  y[1:V] = gamma.3 + gamma.4 * w                       // second layer
  return STANDARDIZE(x + y)                            // layer norm input + output
LOGISTIC_REGRESSION(x: vector(V), delta: matrix(T, V)): simplex(T)
  log_probs[1:T] = delta * x
  return SOFTMAX(log_probs[1:T])
STANDARDIZE(u: vector(V)): vector(V)                   // aka layer norm
  return (u - mean(u)) / standard_deviation(u)
SOFTMAX(u: vector(V)): simplex(V)
  return exp(u) / sum(exp(u))
GELU(u: vector(V)): vector(V)
  return u .* Phi(u)                                   // Phi() std normal cdf
COMPLETE_TEXT(toks: int<low=1, up=T>[N'],
              max_tokens: int<low=0>,
              ...decoder params...): int<low=1, up=T>[]
  toks_out = []
  while (True):
    while (toks.size() > N) toks.pop_first()           // trim to <= N tokens
    prob = DECODE(toks, ...decoder params...)          // next token probs
    next_tok = categorical_rng(prob)                   // gen. next token randomly
    if (next_tok == END_TOKEN): return toks_out        // return if end token
    toks_out.push_last(next_tok)                       // append to output
    if (toks_out.size() == max_tokens): return toks_out  // return if max tokens
    toks.push_last(next_tok)                           // add next token to end
LOG_DENSITY(toks: int<low=1, up=T>[I, J[1:I]],
            ...decoder params...): real<up=0>
  log_density = 0
  for i in 1:I:
    context = []
    for j in 1:J[i]:
      next_tok = toks[i, j]
      probs = DECODE(context, ...decoder params...)    // next token probs
      log_density += log(probs[next_tok])
      if (context.size() == N): context.pop_first()    // trim to <= N tokens
      context.push_last(next_tok)
  return log_density
MH_ATTEND(x: matrix(N, V),
          beta.query: matrix(V, K)[H],
          beta.key: matrix(V, K)[H],
          beta.value: matrix(V, V/H)[H],
          tau: matrix(V, V),
          rho: vector(V)): matrix(N, V)
  for h in 1:H:                                        // parallel heads
    for n in 1:N:
      q[n, 1:K] = x[n, 1:V] * beta.query[h]            // q, k, v vary by head
      k[n, 1:K] = x[n, 1:V] * beta.key[h]
      v[n, 1:V/H] = x[n, 1:V] * beta.value[h]
    for n in 1:N:                                      // loop as in ATTEND
      lp[1:n] = [q[n] * k[1]', ..., q[n] * k[n]']
                / sqrt(V)
      lp[n+1:N] = -inf
      p[1:N] = SOFTMAX(lp[1:N])
      u[h, n, 1:V/H] = SUM(n' in 1:N) p[n'] * v[n', 1:V/H]
  for n in 1:N:
    z[n, 1:V] = concat(u[1, n, 1:V/H], ..., u[H, n, 1:V/H])  // concat head results
    w[n, 1:V] = tau * z[n, 1:V] + rho                  // affine transform
    y[n, 1:V] = STANDARDIZE(x[n, 1:V] + w[n, 1:V])     // residual + std
  return y
I mostly wanted the code in a self-contained place so I could copy-paste it into ChatGPT and ask it to rewrite it in JavaScript.
function softmax(arr) {
  const max = Math.max(...arr); // subtract max for numerical stability
  const expArr = arr.map(x => Math.exp(x - max));
  const sum = expArr.reduce((acc, val) => acc + val, 0);
  return expArr.map(x => x / sum);
}
function gelu(x) {
  // tanh approximation to the standard normal cdf Phi (the usual GELU approximation)
  const cdf = 0.5 * (1.0 + Math.tanh(Math.sqrt(2 / Math.PI) * (x + 0.044715 * Math.pow(x, 3))));
  return x * cdf;
}
function standardize(u) {
  const mean = u.reduce((acc, val) => acc + val, 0) / u.length;
  const std = Math.sqrt(u.map(x => Math.pow(x - mean, 2)).reduce((acc, val) => acc + val, 0) / u.length);
  return u.map(x => (x - mean) / std);
}
function attend(x, beta) {
  let N = x.length;
  let V = x[0].length;
  let K = beta.query[0].length;
  // Project each token into query, key, and value vectors
  let q = new Array(N).fill().map(() => new Array(K).fill(0));
  let k = new Array(N).fill().map(() => new Array(K).fill(0));
  let v = new Array(N).fill().map(() => new Array(V).fill(0));
  for (let n = 0; n < N; n++) {
    for (let j = 0; j < K; j++) {
      q[n][j] = x[n].reduce((sum, xi, idx) => sum + xi * beta.query[idx][j], 0);
      k[n][j] = x[n].reduce((sum, xi, idx) => sum + xi * beta.key[idx][j], 0);
    }
    for (let j = 0; j < V; j++) {
      v[n][j] = x[n].reduce((sum, xi, idx) => sum + xi * beta.value[idx][j], 0);
    }
  }
  let y = new Array(N).fill().map(() => new Array(V).fill(0));
  for (let n = 0; n < N; n++) {
    // Scaled dot-product log weights; token n attends to positions 0..n
    let lp = new Array(N).fill(-Infinity);
    for (let i = 0; i <= n; i++) {
      lp[i] = q[n].reduce((sum, qj, idx) => sum + qj * k[i][idx], 0) / Math.sqrt(V);
    }
    let p = softmax(lp);
    // Weighted average of value vectors
    let u = new Array(V).fill(0);
    for (let i = 0; i < N; i++) {
      for (let j = 0; j < V; j++) {
        u[j] += p[i] * v[i][j];
      }
    }
    // Residual connection plus layer norm
    y[n] = standardize(u.map((uj, idx) => uj + x[n][idx]));
  }
  return y;
}
function feedForward(x, gamma) {
  // First linear layer followed by GELU, then second linear layer
  let u = gamma['1'].map((g1i, idx) => g1i + x.reduce((sum, xi, jdx) => sum + xi * gamma['2'][jdx][idx], 0));
  let w = u.map(val => gelu(val));
  let y = gamma['3'].map((g3i, idx) => g3i + w.reduce((sum, wi, jdx) => sum + wi * gamma['4'][jdx][idx], 0));
  // Residual connection plus layer norm
  return standardize(x.map((xi, idx) => xi + y[idx]));
}
function logisticRegression(x, delta) {
  const T = delta.length;
  let log_probs = new Array(T).fill(0).map((_, idx) => x.reduce((sum, xi, jdx) => sum + xi * delta[idx][jdx], 0));
  return softmax(log_probs);
}
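One thing ChatGPT's translation dropped is the positional encoding: the pseudocode adds POS(n) to each embedding, but the JavaScript embedded the tokens bare. Here is a direct translation of POS (my addition, not ChatGPT's output, so treat it as a sketch); decode below uses it.
function pos(n, N, V) {
  // Sinusoidal positional encoding, translated from POS(n) in the pseudocode.
  // Assumes V is even, as the pseudocode does.
  const u = new Array(V).fill(0);
  for (let i = 1; i <= V / 2; i++) {
    const r = n / Math.pow(N, (2 * i) / V); // exponent ranges from 2/V to 1
    u[2 * i - 2] = Math.sin(r); // pseudocode index 2i-1
    u[2 * i - 1] = Math.cos(r); // pseudocode index 2i
  }
  return u;
}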
function decode(tok, alpha, betas, gammas, delta) {
  const N = tok.length;
  const V = alpha[0].length;
  let xs = new Array(N).fill().map(() => new Array(V).fill(0));
  // Embed input and add the positional encoding (POS(n) in the pseudocode)
  for (let n = 0; n < N; n++) {
    const p = pos(n + 1, N, V);
    xs[n] = alpha[tok[n]].map((ai, idx) => ai + p[idx]);
  }
  const A = betas.length;
  for (let a = 0; a < A; a++) {
    xs = attend(xs, betas[a]);
    for (let n = 0; n < N; n++) {
      xs[n] = feedForward(xs[n], gammas[a]);
    }
  }
  // Next-token probabilities from the final token's representation
  return logisticRegression(xs[N - 1], delta);
}
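The translation stops at decode; COMPLETE_TEXT and LOG_DENSITY from the pseudocode have no JavaScript counterpart. Here are minimal sketches of both. Everything not in the original (the maxContext and endToken arguments and the sampleCategorical helper) is my own assumption.
// Draw an index from a categorical distribution (stand-in for categorical_rng)
function sampleCategorical(probs) {
  let r = Math.random();
  for (let i = 0; i < probs.length; i++) {
    r -= probs[i];
    if (r <= 0) return i;
  }
  return probs.length - 1;
}
// Sketch of COMPLETE_TEXT: sample tokens until endToken or maxTokens
function completeText(toks, maxTokens, maxContext, endToken, alpha, betas, gammas, delta) {
  let context = toks.slice(); // copy so the caller's array isn't mutated
  let toksOut = [];
  while (true) {
    while (context.length > maxContext) context.shift(); // trim to <= maxContext tokens
    const prob = decode(context, alpha, betas, gammas, delta); // next token probs
    const nextTok = sampleCategorical(prob); // generate next token randomly
    if (nextTok === endToken) return toksOut; // return if end token
    toksOut.push(nextTok); // append to output
    if (toksOut.length === maxTokens) return toksOut; // return if max tokens
    context.push(nextTok); // add next token to end
  }
}
// Sketch of LOG_DENSITY: sum of log probs of each token given its history
function logDensity(docs, maxContext, alpha, betas, gammas, delta) {
  let total = 0;
  for (const doc of docs) {
    let context = [];
    for (const nextTok of doc) {
      if (context.length > 0) { // decode needs a non-empty context
        const probs = decode(context, alpha, betas, gammas, delta);
        total += Math.log(probs[nextTok]);
      }
      if (context.length === maxContext) context.shift(); // trim to <= maxContext tokens
      context.push(nextTok);
    }
  }
  return total;
}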
I asked ChatGPT to rewrite the code and make the variable names more descriptive, then asked it to comment every line of code.
// Softmax activation function
function softmax(inputArray) {
  // Find the maximum value within the input array
  const maxValue = Math.max(...inputArray)
  // Exponentiate each element in the input array, offsetting each value by maxValue to prevent overflow/underflow
  const exponentiatedArray = inputArray.map(
    value => Math.exp(value - maxValue)
  )
  // Sum up all the exponentiated values
  const sumOfExponentials = exponentiatedArray.reduce(
    (accum, val) => accum + val, 0
  )
  // Compute the softmax probability distribution over the input array
  return exponentiatedArray.map(value => value / sumOfExponentials)
}
// Gaussian Error Linear Unit (GELU) activation function
function gaussianErrorLinearUnit(inputValue) {
  // Compute the cumulative distribution function (CDF) of a Gaussian distribution for inputValue
  const cumulativeDistFunc = 0.5 * (
    1.0 + Math.tanh(
      Math.sqrt(2 / Math.PI) * (inputValue + 0.044715 * Math.pow(inputValue, 3))
    )
  )
  // Return the GELU activated value
  return inputValue * cumulativeDistFunc
}
// Standardizes an array by subtracting the mean and dividing by the standard deviation
function standardize(inputArray) {
  // Calculate the mean of the input array
  const mean = inputArray.reduce(
    (accum, val) => accum + val, 0
  ) / inputArray.length
  // Calculate the standard deviation of the input array
  const stdDev = Math.sqrt(
    inputArray.map(
      value => Math.pow(value - mean, 2)
    ).reduce(
      (accum, val) => accum + val, 0
    ) / inputArray.length
  )
  // Standardize the input array
  return inputArray.map(value => (value - mean) / stdDev)
}
// Attention mechanism function
function attend(sequence, attentionParameters) {
  // Define lengths and dimensions used in subsequent operations
  const sequenceLength = sequence.length
  const valueDimension = sequence[0].length
  const keyDimension = attentionParameters.query[0].length
  // Initialize matrices for query, key, and value with zeros
  let queryMatrix = new Array(sequenceLength).fill().map(() => new Array(keyDimension).fill(0))
  let keyMatrix = new Array(sequenceLength).fill().map(() => new Array(keyDimension).fill(0))
  let valueMatrix = new Array(sequenceLength).fill().map(() => new Array(valueDimension).fill(0))
  // Calculate query, key, and value matrices using the input sequence and attention parameters
  for (let seqIdx = 0; seqIdx < sequenceLength; seqIdx++) {
    // ...
    // [Similar comments for following for-loops would go here.]
    // ...
  }
  // Initialize the output matrix with zeros
  let output = new Array(sequenceLength).fill().map(() => new Array(valueDimension).fill(0))
  // Calculate attention and update output
  for (let seqIdx = 0; seqIdx < sequenceLength; seqIdx++) {
    // ...
  }
  // Return the attended output
  return output
}
// Feedforward neural network layer
function feedForward(inputVector, feedForwardParams) {
  // Compute the linear combination of inputs and first set of parameters
  let linearCombination = feedForwardParams['1'].map(
    // ...
  )
  // Apply the GELU activation function
  let weightedSum = linearCombination.map(val => gaussianErrorLinearUnit(val))
  // Compute the final output vector as a linear combination of activated inputs and second set of parameters
  let outputVector = feedForwardParams['3'].map(
    // ...
  )
  // Return the standardized sum of input and output vectors
  return standardize(inputVector.map((inputVal, idx) => inputVal + outputVector[idx]))
}
// Logistic regression function
function logisticRegression(inputVector, weights) {
  // Determine the number of classes/targets
  const numClasses = weights.length
  // Compute the logarithm of the probability for each class using the input vector and weights
  let logProbabilities = new Array(numClasses).fill(0).map(
    (_, idx) => inputVector.reduce((sum, inputVal, jdx) => sum + inputVal * weights[idx][jdx], 0)
  )
  // Return the softmax probability distribution of the computed log probabilities
  return softmax(logProbabilities)
}
// Decode function that applies embedding, attention, feedforward layers, and logistic regression
function decode(tokenSequence, embeddingMatrix, attentionParamsArray, feedForwardParamsArray, outputWeights) {
  // Determine the lengths/dimensions of sequences and embeddings
  const numTokens = tokenSequence.length
  const embeddingDim = embeddingMatrix[0].length
  // Initialize an array to hold the embedded sequences
  let embeddedSequence = new Array(numTokens).fill().map(() => new Array(embeddingDim).fill(0))
  // Embed the input token sequence and add the positional encoding (pos is defined above)
  for (let tokIdx = 0; tokIdx < numTokens; tokIdx++) {
    const posEncoding = pos(tokIdx + 1, numTokens, embeddingDim)
    embeddedSequence[tokIdx] = embeddingMatrix[tokenSequence[tokIdx]].map(
      (value, idx) => value + posEncoding[idx]
    )
  }
  // Define the number of attention layers to process through
  const numAttentionLayers = attentionParamsArray.length
  // Process the embedded sequence through each attention and feedforward layer
  for (let layerIdx = 0; layerIdx < numAttentionLayers; layerIdx++) {
    embeddedSequence = attend(embeddedSequence, attentionParamsArray[layerIdx])
    for (let tokIdx = 0; tokIdx < numTokens; tokIdx++) {
      embeddedSequence[tokIdx] = feedForward(embeddedSequence[tokIdx], feedForwardParamsArray[layerIdx])
    }
  }
  // Apply logistic regression to the final token in the sequence and return the resulting probability distribution
  return logisticRegression(embeddedSequence[numTokens - 1], outputWeights)
}
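To check that the translation runs end to end, here's a tiny smoke test I added. It has to run against the first, complete translation, since the annotated rewrite above is abridged and redefines the same function names. The sizes and random parameters are made up.
// Tiny smoke test with random parameters (not from the original post).
// Note the JS indexes gamma['2'] as (V x L) and gamma['4'] as (L x V),
// transposed relative to the pseudocode's declarations.
const rand = (rows, cols) =>
  Array.from({ length: rows }, () =>
    Array.from({ length: cols }, () => Math.random() - 0.5));
const randVec = n => Array.from({ length: n }, () => Math.random() - 0.5);

const T = 5, V = 4, K = 2, L = 3; // vocab, embedding, key, and hidden sizes
const alpha = rand(T, V);
const betas = [{ query: rand(V, K), key: rand(V, K), value: rand(V, V) }];
const gammas = [{ '1': randVec(L), '2': rand(V, L), '3': randVec(V), '4': rand(L, V) }];
const delta = rand(T, V);

console.log(decode([0, 3, 1], alpha, betas, gammas, delta)); // T probabilities summing to ~1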