Last active
June 25, 2024 19:41
-
-
Save philipturner/da41ce2db6553dad16187a0cf329c5e9 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// MFA 1 (current implementation) | |
func generateBlockMask1() {
  // Illustrates the MFA 1 pipeline: materialize the full [seqlen x seqlen]
  // causal attention mask on the CPU, compress it into a per-block mask on
  // the GPU, then consume the block mask inside the attention kernel.
  //
  // 2 gigabytes of memory.
  // - This memory bottleneck predates MFA.
  // - Originated from the method of running naive attention in PyTorch:
  //   allocate a statically shaped tensor of dimension [seqlen x seqlen]
  var attentionMask = [Float16](repeating: .zero, count: 32_000 * 32_000)
  // On the CPU, s4nnc side.
  // - Create the attention mask in CPU code, with an s4nnc API.
  // - About 1 billion clock cycles.
  // - Several hundred milliseconds of latency, likely has gone unnoticed
  //   since the Draw Things app first launched (there was masked attention
  //   in some part of SDv1?)
  // - This latency bottleneck predates MFA.
  //
  // SERIAL FOR LOOP
  //
  for rowID in 0..<32_000 {
    for columnID in 0..<32_000 {
      // This block of code is invoked 1 billion times.
      do {
        // Causal mask: positions at or after the query (row <= column) are
        // visible (0); earlier positions are masked out (-infinity).
        var maskValue: Float16
        if rowID <= columnID {
          maskValue = 0
        } else {
          maskValue = -.infinity
        }
        let address = rowID * 32_000 + columnID
        attentionMask[address] = maskValue
      }
    }
  }
  // 1 megabyte of memory.
  let rowBlockSize = 32
  let columnBlockSize = [32, 40, 56, 72].randomElement()!
  // Number of blocks along each axis of the block mask. These counts — not
  // the block sizes — are the loop bounds and the row stride below.
  // NOTE(review): 32_000 is not divisible by 40, 56, or 72; the ragged edge
  // is ignored here for illustration.
  let rowBlockCount = 32_000 / rowBlockSize
  let columnBlockCount = 32_000 / columnBlockSize
  var blockMask = [UInt8](
    repeating: .zero,
    count: rowBlockCount * columnBlockCount)
  // On the GPU, in the libNNC MFA backend.
  // - Copy the blockmask to the compressed form.
  // - Executes very fast, because the loop iterations are parallelized.
  // - Primary reason for running on the GPU, is not performance, but actually
  //   ease of implementation. The libNNC MFA backend needs to match the block
  //   size of this mask to the actual attention kernel block size (32x32,
  //   32x40, 32x56, or 32x72). There is no mechanism to elevate information
  //   about GPU kernel block size all the way from the libNNC backend to the
  //   s4nnc frontend API.
  //
  // - 200 microseconds of latency.
  // - The overhead of dispatching one GPU kernel for something with a
  //   one-time cost is unjustified.
  // - The overhead is so little, we actually dispatch this GPU kernel
  //   every single time attention is invoked! But we shouldn't have to.
  //
  // PARALLEL FOR LOOP
  //
  // Outer loop over threadgroups.
  // 0..<1000
  for rowBlockID in 0..<rowBlockCount {
    let rowStart = rowBlockID * rowBlockSize
    let rowEnd = (rowBlockID + 1) * rowBlockSize
    // Outer loop over threadgroups.
    // 0..<571 (when the column block size is 56)
    for columnBlockID in 0..<columnBlockCount {
      let columnStart = columnBlockID * columnBlockSize
      let columnEnd = (columnBlockID + 1) * columnBlockSize
      // Classify the block by scanning every element inside it.
      var hasInfinity = false
      var hasZero = false
      var hasOtherNumber = false
      // Inner loop over threads.
      // 0..<32, 32..<64 ... 31968..<32,000
      // 0..<32_000 (if you were to execute them serially)
      for rowID in rowStart..<rowEnd {
        // Inner loop over threads.
        // 0..<32, 32..<64 ... 31968..<32,000
        // 0..<32_000 (if you were to execute them serially)
        for columnID in columnStart..<columnEnd {
          // This block of code is invoked 1 billion times.
          do {
            let address = rowID * 32_000 + columnID
            let maskValue = attentionMask[address]
            if maskValue == -.infinity {
              hasInfinity = true
            } else if maskValue == 0 {
              hasZero = true
            } else {
              hasOtherNumber = true
            }
          }
        }
      }
      // Choose the mask value.
      var maskValue: UInt8
      switch (hasInfinity, hasZero, hasOtherNumber) {
      case (true, false, false):
        // The GPU will perform 1 computation for this block, which is to
        // read the mask value and discover it's zero.
        maskValue = 0
      case (false, true, false):
        // The GPU will perform 32 * 32 * D computations for this block.
        maskValue = 1
      default:
        // Some of the attention matrix elements need to be computed. The GPU
        // will read from a small sub-region of the FULL ATTENTION MASK, a
        // memory allocation spanning 1 billion elements.
        maskValue = 2
      }
      // Row stride is the number of column blocks (571 only when the block
      // size happens to be 56); the original hard-coded 571 was wrong for
      // the other randomly chosen block sizes.
      let blockID = rowBlockID * columnBlockCount + columnBlockID
      blockMask[blockID] = maskValue
    }
  }
  // Attention kernel on the GPU.
  //
  // Dispatch 32,000 / 32 threadgroups.
  // Each threadgroup runs 32,000 / (32 to 72) iterations.
  for groupID in 0..<rowBlockCount {
    // One block of the attention mask, staged in registers. Sized by the
    // actual block dimensions rather than a hard-coded 32 x 56.
    var registerAllocation = [Float16](
      repeating: .zero, count: rowBlockSize * columnBlockSize)
    for loopIterationID in 0..<columnBlockCount {
      // Index with the column-block COUNT (the row stride of `blockMask`),
      // not the column-block size.
      let blockMaskAddress = groupID * columnBlockCount + loopIterationID
      let blockMaskValue = blockMask[blockMaskAddress]
      switch blockMaskValue {
      case 0:
        // This part of the code is reached in 49.9% of the iterations.
        continue // skip forward to next iteration
      case 1:
        // This part of the code is reached in 49.9% of the iterations.
        break // finish iteration
      case 2:
        // This part of the code is reached in 0.2% of the iterations.
        let rowStart = groupID * rowBlockSize
        let rowEnd = (groupID + 1) * rowBlockSize
        let columnStart = loopIterationID * columnBlockSize
        let columnEnd = (loopIterationID + 1) * columnBlockSize
        // Run in parallel over threads.
        for rowID in rowStart..<rowEnd {
          for columnID in columnStart..<columnEnd {
            let address = rowID * 32_000 + columnID
            // Register tile stride is the column block size, not a
            // hard-coded 56.
            let registerID =
              (rowID - rowStart) * columnBlockSize + (columnID - columnStart)
            registerAllocation[registerID] = attentionMask[address]
          }
        }
      default:
        fatalError("This should never happen.")
      }
      // Materialize a block of the attention matrix.
      // ...
      // Add the register allocation to the temporarily materialized block of
      // the attention matrix.
      // ...
    }
  }
}
// MFA 2 (fixes the bottlenecks in CPU latency and GPU memory allocation) | |
func generateBlockMask2() {
  // Illustrates the MFA 2 pipeline: generate the block codes directly on the
  // CPU (no 2 GB dense mask), store mask data only for mixed blocks, and
  // pack each block's code + data address into one 32-bit word.
  //
  // Expose the block size to the CPU.
  let rowBlockSize: Int = 32
  let columnBlockSize: Int = 56 // Query the best block size from the NNC backend.
  // 1_000 = 32_000 / 32
  // 571 = 32_000 / 56
  let rowBlockCount = 32_000 / rowBlockSize
  let columnBlockCount = 32_000 / columnBlockSize
  // Elements in one block of the mask (32 * 56 = 1,792).
  let blockElementCount = rowBlockSize * columnBlockSize
  var blockCodes = [UInt8](
    repeating: .zero, count: rowBlockCount * columnBlockCount)
  var blockAddresses = [Int32](
    repeating: -1, count: rowBlockCount * columnBlockCount)
  // Mask data is appended lazily, only for blocks that straddle the diagonal.
  var attentionMask: [Float16] = []
  // On the CPU, s4nnc side.
  // - Create the attention mask in CPU code, with an s4nnc API for
  //   programmable blockmasks.
  // - About 2,500,000 clock cycles.
  // - One millisecond of latency.
  //
  // SERIAL FOR LOOP
  //
  for rowBlockID in 0..<rowBlockCount {
    for columnBlockID in 0..<columnBlockCount {
      let rowStart = rowBlockID * rowBlockSize
      let rowEnd = (rowBlockID + 1) * rowBlockSize
      let columnStart = columnBlockID * columnBlockSize
      let columnEnd = (columnBlockID + 1) * columnBlockSize
      // This block of code is invoked 571,000 times.
      do {
        let blockID = rowBlockID * columnBlockCount + columnBlockID
        // NOTE(review): both diagonal tests below are conservative by one —
        // the exact bounds would be `rowEnd - 1 <= columnStart` and
        // `rowStart >= columnEnd`. Blocks on the boundary fall through to
        // code 2, which is still correct, just slightly more work.
        if rowEnd < columnStart {
          // This part of the code is reached in 49.9% of the iterations.
          // Entirely above the diagonal: all zeros, compute fully.
          blockCodes[blockID] = 1
          continue
        }
        if rowStart > columnEnd {
          // This part of the code is reached in 49.9% of the iterations.
          // Entirely below the diagonal: all -infinity, skip entirely.
          blockCodes[blockID] = 0
          continue
        }
        // This part of the code is only reached in 0.2% of the iterations.
        // Mixed block: record where its mask data will start in the
        // compact `attentionMask` array.
        blockCodes[blockID] = 2
        blockAddresses[blockID] = Int32(attentionMask.count)
      }
      // Only mixed (code 2) blocks reach this point; the others `continue`d.
      // This block of code is invoked 1,142 times.
      for rowID in rowStart..<rowEnd {
        for columnID in columnStart..<columnEnd {
          // This block of code is invoked 2 million times.
          do {
            var maskValue: Float16
            if rowID <= columnID {
              maskValue = 0
            } else {
              maskValue = -.infinity
            }
            attentionMask.append(maskValue)
          }
        }
      }
    }
  }
  // 2 megabytes of memory
  var blockMask = [UInt32](
    repeating: .zero, count: rowBlockCount * columnBlockCount)
  // On the CPU, in the libNNC MFA backend.
  // - Execute on the CPU instead of the GPU.
  // - Now, the MFA backend doesn't need to code an entire GPU kernel just
  //   to overcome an implementation issue.
  // - 500,000 clock cycles.
  // - 200 microseconds of latency.
  // - One-time cost (we don't do this every time an attention kernel
  //   is dispatched, just when the attention mask is first created).
  //
  // SERIAL FOR LOOP
  //
  for rowBlockID in 0..<rowBlockCount {
    for columnBlockID in 0..<columnBlockCount {
      let blockID = rowBlockID * columnBlockCount + columnBlockID
      let code = blockCodes[blockID]
      // Should be -1 if no memory was allocated for this block.
      var address = blockAddresses[blockID]
      // We know the addresses within the array will be divisible by 32 * 56.
      // If we divide the address by 32 * 56, we can store the same amount of
      // information with less bits. Enough to compress the 32-bit integer to
      // 24-bit. The -1 sentinel truncates to 0, which is harmless: codes 0
      // and 1 never dereference the address.
      address /= Int32(blockElementCount)
      // Pack: low 8 bits = block code, high 24 bits = compressed address.
      let maskValue = UInt32(code) + UInt32(address &<< 8)
      blockMask[blockID] = maskValue
    }
  }
  // Attention kernel on the GPU.
  //
  // Dispatch 32,000 / 32 threadgroups.
  // Each threadgroup runs 32,000 / (32 to 72) iterations.
  //
  // PARALLEL FOR LOOP
  //
  for groupID in 0..<rowBlockCount {
    var registerAllocation = [Float16](
      repeating: .zero, count: blockElementCount)
    for loopIterationID in 0..<columnBlockCount {
      // Index with the column-block COUNT (571, the row stride of
      // `blockMask`); the original multiplied by the block size (56).
      let blockMaskAddress = groupID * columnBlockCount + loopIterationID
      let blockMaskValue = blockMask[blockMaskAddress]
      // Low 8 bits carry the block code.
      switch UInt8(truncatingIfNeeded: blockMaskValue) {
      case 0:
        // This part of the code is reached in 49.9% of the iterations.
        continue // skip forward to next iteration
      case 1:
        // This part of the code is reached in 49.9% of the iterations.
        break // finish iteration
      case 2:
        // This part of the code is reached in 0.2% of the iterations.
        let rowStart = groupID * rowBlockSize
        let rowEnd = (groupID + 1) * rowBlockSize
        let columnStart = loopIterationID * columnBlockSize
        let columnEnd = (loopIterationID + 1) * columnBlockSize
        // Run in parallel over threads.
        for rowID in rowStart..<rowEnd {
          for columnID in columnStart..<columnEnd {
            // ============================================================== //
            // THIS IS THE PART OF THE MFA2 CODEBASE THAT HAS CHANGED.
            // (the other part being deletion of the generate_blockmask kernel)
            // ============================================================== //
            // Decompress the block's base address, then offset into the
            // compact mask data by the element's position within the block.
            var address = blockMaskValue &>> 8
            address *= UInt32(blockElementCount)
            let registerID =
              (rowID - rowStart) * columnBlockSize + (columnID - columnStart)
            address += UInt32(registerID)
            registerAllocation[registerID] = attentionMask[Int(address)]
          }
        }
      default:
        fatalError("This should never happen.")
      }
      // Materialize a block of the attention matrix.
      // ...
      // Add the register allocation to the temporarily materialized block of
      // the attention matrix.
      // ...
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment