Skip to content

Instantly share code, notes, and snippets.

@philipturner
Last active June 25, 2024 19:41
Show Gist options
  • Save philipturner/da41ce2db6553dad16187a0cf329c5e9 to your computer and use it in GitHub Desktop.
// MFA 1 (current implementation)
//
// Generates the block-sparsity mask for a causal attention kernel by first
// materializing the full [seqLen x seqLen] attention mask (the PyTorch-style
// approach), then compressing it into one code per
// (rowBlockSize x columnBlockSize) tile.
//
// - Parameter sequenceLength: attention sequence length. Defaults to 32,000,
//   matching the original benchmark. Any ragged tail of blocks (when the
//   length is not a multiple of a block size) is ignored, as in the original
//   sketch.
// - Parameter columnBlockSize: kernel column block size; `nil` picks one of
//   the widths MFA actually uses (32, 40, 56, 72) at random, as the original
//   code did.
// - Returns: the compressed block mask, row-major over blocks.
//   0 = all -infinity (skip block), 1 = all zero (compute without reading the
//   mask), 2 = mixed (read the dense mask). Discardable so existing
//   `generateBlockMask1()` callers compile unchanged.
@discardableResult
func generateBlockMask1(
  sequenceLength: Int = 32_000,
  columnBlockSize: Int? = nil
) -> [UInt8] {
  // O(seqLen^2) memory — 2 gigabytes at seqLen = 32,000.
  // - This memory bottleneck predates MFA.
  // - Originated from the method of running naive attention in PyTorch:
  //   allocate a statically shaped tensor of dimension [seqlen x seqlen].
  var attentionMask = [Float16](
    repeating: .zero, count: sequenceLength * sequenceLength)

  // On the CPU, s4nnc side.
  // - Create the attention mask in CPU code, with an s4nnc API.
  // - About 1 billion clock cycles at seqLen = 32,000.
  // - Several hundred milliseconds of latency, likely has gone unnoticed
  //   since the Draw Things app first launched (there was masked attention
  //   in some part of SDv1?)
  // - This latency bottleneck predates MFA.
  //
  // SERIAL FOR LOOP
  for rowID in 0..<sequenceLength {
    for columnID in 0..<sequenceLength {
      // This block of code is invoked seqLen^2 (~1 billion) times.
      // Causal convention from the original sketch: 0 where
      // rowID <= columnID, -infinity elsewhere.
      let maskValue: Float16 = (rowID <= columnID) ? 0 : -.infinity
      attentionMask[rowID * sequenceLength + columnID] = maskValue
    }
  }

  // ~1 megabyte of memory at seqLen = 32,000.
  let rowBlockSize = 32
  let columnBlockSize = columnBlockSize ?? [32, 40, 56, 72].randomElement()!
  let rowBlocks = sequenceLength / rowBlockSize
  let columnBlocks = sequenceLength / columnBlockSize
  var blockMask = [UInt8](repeating: .zero, count: rowBlocks * columnBlocks)

  // On the GPU, in the libNNC MFA backend.
  // - Copy the blockmask to the compressed form.
  // - Executes very fast, because the loop iterations are parallelized.
  // - Primary reason for running on the GPU, is not performance, but actually
  //   ease of implementation. The libNNC MFA backend needs to match the block
  //   size of this mask to the actual attention kernel block size (32x32,
  //   32x40, 32x56, or 32x72). There is no mechanism to elevate information
  //   about GPU kernel block size all the way from the libNNC backend to the
  //   s4nnc frontend API.
  //
  // - 200 microseconds of latency.
  // - The overhead of dispatching one GPU kernel for something with a
  //   one-time cost is unjustified.
  // - The overhead is so little, we actually dispatch this GPU kernel
  //   every single time attention is invoked! But we shouldn't have to.
  //
  // PARALLEL FOR LOOP
  //
  // Outer loops over threadgroups: 0..<rowBlocks and 0..<columnBlocks
  // (1,000 and 571-1,000 at seqLen = 32,000).
  // FIX: previously iterated 0..<rowBlockSize / 0..<columnBlockSize, which
  // scans only a tiny corner of the matrix instead of every block.
  for rowBlockID in 0..<rowBlocks {
    let rowStart = rowBlockID * rowBlockSize
    let rowEnd = rowStart + rowBlockSize
    for columnBlockID in 0..<columnBlocks {
      let columnStart = columnBlockID * columnBlockSize
      let columnEnd = columnStart + columnBlockSize

      var hasInfinity = false
      var hasZero = false
      var hasOtherNumber = false
      // Inner loops over threads within the threadgroup; serially this
      // covers every element of the block.
      for rowID in rowStart..<rowEnd {
        for columnID in columnStart..<columnEnd {
          // This block of code is invoked seqLen^2 (~1 billion) times.
          let maskValue = attentionMask[rowID * sequenceLength + columnID]
          if maskValue == -.infinity {
            hasInfinity = true
          } else if maskValue == 0 {
            hasZero = true
          } else {
            hasOtherNumber = true
          }
        }
      }

      // Choose the block code.
      var maskValue: UInt8
      switch (hasInfinity, hasZero, hasOtherNumber) {
      case (true, false, false):
        // Entire block is masked out. The GPU will perform 1 computation for
        // this block, which is to read the code and discover it can skip.
        maskValue = 0
      case (false, true, false):
        // Entire block is unmasked. The GPU will perform
        // rowBlockSize * columnBlockSize * D computations for this block,
        // without reading the dense mask.
        maskValue = 1
      default:
        // Some of the attention matrix elements need to be masked. The GPU
        // will read from a small sub-region of the FULL ATTENTION MASK, a
        // memory allocation spanning ~1 billion elements.
        maskValue = 2
      }
      // FIX: the row stride is the number of column blocks, not the
      // hardcoded constant 571 (which is only correct for width 56 at
      // seqLen = 32,000).
      blockMask[rowBlockID * columnBlocks + columnBlockID] = maskValue
    }
  }

  // Attention kernel on the GPU.
  //
  // Dispatch rowBlocks threadgroups.
  // Each threadgroup runs columnBlocks loop iterations.
  for groupID in 0..<rowBlocks {
    // Per-threadgroup staging for one block of the dense mask.
    // FIX: sized by the actual block dimensions, not the constant 32 * 56.
    var registerAllocation = [Float16](
      repeating: .zero, count: rowBlockSize * columnBlockSize)
    for loopIterationID in 0..<columnBlocks {
      // FIX: the blockmask row stride is columnBlocks, not columnBlockSize.
      let blockMaskValue = blockMask[groupID * columnBlocks + loopIterationID]
      switch blockMaskValue {
      case 0:
        // Reached in ~49.9% of the iterations: fully masked block.
        continue // skip forward to next iteration
      case 1:
        // Reached in ~49.9% of the iterations: fully unmasked block.
        break // exits the switch; falls through to the materialization below
      case 2:
        // Reached in ~0.2% of the iterations: mixed block — load the dense
        // mask sub-region into registers.
        let rowStart = groupID * rowBlockSize
        let columnStart = loopIterationID * columnBlockSize
        // Run in parallel over threads.
        for rowID in rowStart..<(rowStart + rowBlockSize) {
          for columnID in columnStart..<(columnStart + columnBlockSize) {
            // FIX: the register row stride is columnBlockSize, not the
            // hardcoded constant 56.
            let registerID =
              (rowID - rowStart) * columnBlockSize + (columnID - columnStart)
            registerAllocation[registerID] =
              attentionMask[rowID * sequenceLength + columnID]
          }
        }
      default:
        fatalError("This should never happen.")
      }
      // Materialize a block of the attention matrix.
      // ...
      // Add the register allocation to the temporarily materialized block of
      // the attention matrix.
      // ...
    }
  }
  return blockMask
}
// MFA 2 (fixes the bottlenecks in CPU latency and GPU memory allocation)
//
// Instead of materializing the full [seqLen x seqLen] mask, only the mixed
// blocks straddling the causal boundary are materialized, and each block's
// code + compressed storage address are packed into a single UInt32.
//
// - Parameter sequenceLength: attention sequence length. Defaults to 32,000,
//   matching the original benchmark.
// - Returns: the packed block mask, row-major over blocks. Low 8 bits are the
//   code (0 = all -infinity, 1 = all zero, 2 = mixed); the upper 24 bits are
//   the block's address in the sparse mask storage, divided by the block
//   area. Discardable so existing `generateBlockMask2()` callers compile
//   unchanged.
@discardableResult
func generateBlockMask2(sequenceLength: Int = 32_000) -> [UInt32] {
  // Expose the block size to the CPU.
  let rowBlockSize: Int = 32
  let columnBlockSize: Int = 56 // Query the best block size from the NNC backend.
  // 1,000 x 571 blocks at seqLen = 32,000.
  let rowBlocks = sequenceLength / rowBlockSize
  let columnBlocks = sequenceLength / columnBlockSize
  let blockArea = rowBlockSize * columnBlockSize

  var blockCodes = [UInt8](repeating: .zero, count: rowBlocks * columnBlocks)
  // -1 marks blocks with no backing storage (codes 0 and 1).
  var blockAddresses = [Int32](repeating: -1, count: rowBlocks * columnBlocks)
  // Sparse storage: only mixed blocks append here — a few megabytes instead
  // of 2 gigabytes.
  var attentionMask: [Float16] = []

  // On the CPU, s4nnc side.
  // - Create the attention mask in CPU code, with an s4nnc API for
  //   programmable blockmasks.
  // - About 2,500,000 clock cycles; ~one millisecond of latency.
  //
  // SERIAL FOR LOOP
  for rowBlockID in 0..<rowBlocks {
    for columnBlockID in 0..<columnBlocks {
      let rowStart = rowBlockID * rowBlockSize
      let rowEnd = rowStart + rowBlockSize
      let columnStart = columnBlockID * columnBlockSize
      let columnEnd = columnStart + columnBlockSize
      // This classification is invoked once per block (571,000 times at
      // seqLen = 32,000).
      let blockID = rowBlockID * columnBlocks + columnBlockID

      // FIX: the original used `rowEnd < columnStart` / `rowStart >
      // columnEnd`, off by one on both edges (the ends are exclusive), which
      // needlessly materialized blocks that touch but do not cross the
      // causal boundary.
      if rowEnd <= columnStart {
        // Reached in ~49.9% of the iterations: every element satisfies
        // rowID < columnID, so the whole block is zero.
        blockCodes[blockID] = 1
        continue
      }
      if rowStart >= columnEnd {
        // Reached in ~49.9% of the iterations: every element satisfies
        // rowID > columnID, so the whole block is -infinity.
        blockCodes[blockID] = 0
        continue
      }

      // Reached in only ~0.2% of the iterations: a mixed block. Record where
      // its values start, then materialize exactly blockArea values.
      blockCodes[blockID] = 2
      blockAddresses[blockID] = Int32(attentionMask.count)
      for rowID in rowStart..<rowEnd {
        for columnID in columnStart..<columnEnd {
          // Invoked ~2 million times in total across all mixed blocks.
          let maskValue: Float16 = (rowID <= columnID) ? 0 : -.infinity
          attentionMask.append(maskValue)
        }
      }
    }
  }

  // ~2 megabytes of memory.
  var blockMask = [UInt32](repeating: .zero, count: rowBlocks * columnBlocks)
  // On the CPU, in the libNNC MFA backend.
  // - Execute on the CPU instead of the GPU.
  // - Now, the MFA backend doesn't need to code an entire GPU kernel just
  //   to overcome an implementation issue.
  // - ~500,000 clock cycles; ~200 microseconds of latency.
  // - One-time cost (we don't do this every time an attention kernel is
  //   dispatched, just when the attention mask is first created).
  //
  // SERIAL FOR LOOP
  for blockID in 0..<(rowBlocks * columnBlocks) {
    let code = blockCodes[blockID]
    let address = blockAddresses[blockID]
    // Addresses within the sparse array are multiples of blockArea (each
    // mixed block appends exactly blockArea values), so dividing by it
    // compresses the 32-bit address into 24 bits.
    // FIX: guard the -1 sentinel explicitly instead of relying on
    // -1 / blockArea truncating to 0, and derive the divisor from the block
    // sizes rather than hardcoding 32 * 56.
    if code == 2 {
      blockMask[blockID] =
        UInt32(code) | ((UInt32(address) / UInt32(blockArea)) &<< 8)
    } else {
      blockMask[blockID] = UInt32(code)
    }
  }

  // Attention kernel on the GPU.
  //
  // Dispatch rowBlocks threadgroups.
  // Each threadgroup runs columnBlocks loop iterations.
  //
  // PARALLEL FOR LOOP
  for groupID in 0..<rowBlocks {
    var registerAllocation = [Float16](repeating: .zero, count: blockArea)
    for loopIterationID in 0..<columnBlocks {
      // FIX: the blockmask row stride is columnBlocks (571 at seqLen =
      // 32,000), not columnBlockSize (56); the old index read the wrong rows
      // of the blockmask.
      let blockMaskValue = blockMask[groupID * columnBlocks + loopIterationID]
      switch UInt8(truncatingIfNeeded: blockMaskValue) {
      case 0:
        // Reached in ~49.9% of the iterations: fully masked block.
        continue // skip forward to next iteration
      case 1:
        // Reached in ~49.9% of the iterations: fully unmasked block.
        break // exits the switch; falls through to the materialization below
      case 2:
        // Reached in ~0.2% of the iterations.
        // ============================================================== //
        // THIS IS THE PART OF THE MFA2 CODEBASE THAT HAS CHANGED.
        // (the other part being deletion of the generate_blockmask kernel)
        // ============================================================== //
        let rowStart = groupID * rowBlockSize
        let columnStart = loopIterationID * columnBlockSize
        // Decode the 24-bit compressed address back to an element offset.
        let baseAddress = Int(blockMaskValue &>> 8) * blockArea
        // Run in parallel over threads.
        for rowID in rowStart..<(rowStart + rowBlockSize) {
          for columnID in columnStart..<(columnStart + columnBlockSize) {
            let registerID =
              (rowID - rowStart) * columnBlockSize + (columnID - columnStart)
            registerAllocation[registerID] =
              attentionMask[baseAddress + registerID]
          }
        }
      default:
        fatalError("This should never happen.")
      }
      // Materialize a block of the attention matrix.
      // ...
      // Add the register allocation to the temporarily materialized block of
      // the attention matrix.
      // ...
    }
  }
  return blockMask
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment