Skip to content

Instantly share code, notes, and snippets.

@philipturner
Last active June 25, 2024 19:41
Show Gist options
  • Save philipturner/da41ce2db6553dad16187a0cf329c5e9 to your computer and use it in GitHub Desktop.
// MFA 1 (current implementation)
//
// Generates the block-sparsity mask for a causal attention kernel by first
// materializing the full [seqLen x seqLen] attention mask (the PyTorch-style
// approach), then compressing it into one code per
// (rowBlockSize x columnBlockSize) tile.
//
// - Parameter sequenceLength: attention sequence length. Defaults to 32,000,
//   matching the original benchmark. Any ragged tail of blocks (when the
//   length is not a multiple of a block size) is ignored, as in the original
//   sketch.
// - Parameter columnBlockSize: kernel column block size; `nil` picks one of
//   the widths MFA actually uses (32, 40, 56, 72) at random, as the original
//   code did.
// - Returns: the compressed block mask, row-major over blocks.
//   0 = all -infinity (skip block), 1 = all zero (compute without reading the
//   mask), 2 = mixed (read the dense mask). Discardable so existing
//   `generateBlockMask1()` callers compile unchanged.
@discardableResult
func generateBlockMask1(
  sequenceLength: Int = 32_000,
  columnBlockSize: Int? = nil
) -> [UInt8] {
  // O(seqLen^2) memory — 2 gigabytes at seqLen = 32,000.
  // - This memory bottleneck predates MFA.
  // - Originated from the method of running naive attention in PyTorch:
  //   allocate a statically shaped tensor of dimension [seqlen x seqlen].
  var attentionMask = [Float16](
    repeating: .zero, count: sequenceLength * sequenceLength)

  // On the CPU, s4nnc side.
  // - Create the attention mask in CPU code, with an s4nnc API.
  // - About 1 billion clock cycles at seqLen = 32,000.
  // - Several hundred milliseconds of latency, likely has gone unnoticed
  //   since the Draw Things app first launched (there was masked attention
  //   in some part of SDv1?)
  // - This latency bottleneck predates MFA.
  //
  // SERIAL FOR LOOP
  for rowID in 0..<sequenceLength {
    for columnID in 0..<sequenceLength {
      // This block of code is invoked seqLen^2 (~1 billion) times.
      // Causal convention from the original sketch: 0 where
      // rowID <= columnID, -infinity elsewhere.
      let maskValue: Float16 = (rowID <= columnID) ? 0 : -.infinity
      attentionMask[rowID * sequenceLength + columnID] = maskValue
    }
  }

  // ~1 megabyte of memory at seqLen = 32,000.
  let rowBlockSize = 32
  let columnBlockSize = columnBlockSize ?? [32, 40, 56, 72].randomElement()!
  let rowBlocks = sequenceLength / rowBlockSize
  let columnBlocks = sequenceLength / columnBlockSize
  var blockMask = [UInt8](repeating: .zero, count: rowBlocks * columnBlocks)

  // On the GPU, in the libNNC MFA backend.
  // - Copy the blockmask to the compressed form.
  // - Executes very fast, because the loop iterations are parallelized.
  // - Primary reason for running on the GPU, is not performance, but actually
  //   ease of implementation. The libNNC MFA backend needs to match the block
  //   size of this mask to the actual attention kernel block size (32x32,
  //   32x40, 32x56, or 32x72). There is no mechanism to elevate information
  //   about GPU kernel block size all the way from the libNNC backend to the
  //   s4nnc frontend API.
  //
  // - 200 microseconds of latency.
  // - The overhead of dispatching one GPU kernel for something with a
  //   one-time cost is unjustified.
  // - The overhead is so little, we actually dispatch this GPU kernel
  //   every single time attention is invoked! But we shouldn't have to.
  //
  // PARALLEL FOR LOOP
  //
  // Outer loops over threadgroups: 0..<rowBlocks and 0..<columnBlocks
  // (1,000 and 571-1,000 at seqLen = 32,000).
  // FIX: previously iterated 0..<rowBlockSize / 0..<columnBlockSize, which
  // scans only a tiny corner of the matrix instead of every block.
  for rowBlockID in 0..<rowBlocks {
    let rowStart = rowBlockID * rowBlockSize
    let rowEnd = rowStart + rowBlockSize
    for columnBlockID in 0..<columnBlocks {
      let columnStart = columnBlockID * columnBlockSize
      let columnEnd = columnStart + columnBlockSize

      var hasInfinity = false
      var hasZero = false
      var hasOtherNumber = false
      // Inner loops over threads within the threadgroup; serially this
      // covers every element of the block.
      for rowID in rowStart..<rowEnd {
        for columnID in columnStart..<columnEnd {
          // This block of code is invoked seqLen^2 (~1 billion) times.
          let maskValue = attentionMask[rowID * sequenceLength + columnID]
          if maskValue == -.infinity {
            hasInfinity = true
          } else if maskValue == 0 {
            hasZero = true
          } else {
            hasOtherNumber = true
          }
        }
      }

      // Choose the block code.
      var maskValue: UInt8
      switch (hasInfinity, hasZero, hasOtherNumber) {
      case (true, false, false):
        // Entire block is masked out. The GPU will perform 1 computation for
        // this block, which is to read the code and discover it can skip.
        maskValue = 0
      case (false, true, false):
        // Entire block is unmasked. The GPU will perform
        // rowBlockSize * columnBlockSize * D computations for this block,
        // without reading the dense mask.
        maskValue = 1
      default:
        // Some of the attention matrix elements need to be masked. The GPU
        // will read from a small sub-region of the FULL ATTENTION MASK, a
        // memory allocation spanning ~1 billion elements.
        maskValue = 2
      }
      // FIX: the row stride is the number of column blocks, not the
      // hardcoded constant 571 (which is only correct for width 56 at
      // seqLen = 32,000).
      blockMask[rowBlockID * columnBlocks + columnBlockID] = maskValue
    }
  }

  // Attention kernel on the GPU.
  //
  // Dispatch rowBlocks threadgroups.
  // Each threadgroup runs columnBlocks loop iterations.
  for groupID in 0..<rowBlocks {
    // Per-threadgroup staging for one block of the dense mask.
    // FIX: sized by the actual block dimensions, not the constant 32 * 56.
    var registerAllocation = [Float16](
      repeating: .zero, count: rowBlockSize * columnBlockSize)
    for loopIterationID in 0..<columnBlocks {
      // FIX: the blockmask row stride is columnBlocks, not columnBlockSize.
      let blockMaskValue = blockMask[groupID * columnBlocks + loopIterationID]
      switch blockMaskValue {
      case 0:
        // Reached in ~49.9% of the iterations: fully masked block.
        continue // skip forward to next iteration
      case 1:
        // Reached in ~49.9% of the iterations: fully unmasked block.
        break // exits the switch; falls through to the materialization below
      case 2:
        // Reached in ~0.2% of the iterations: mixed block — load the dense
        // mask sub-region into registers.
        let rowStart = groupID * rowBlockSize
        let columnStart = loopIterationID * columnBlockSize
        // Run in parallel over threads.
        for rowID in rowStart..<(rowStart + rowBlockSize) {
          for columnID in columnStart..<(columnStart + columnBlockSize) {
            // FIX: the register row stride is columnBlockSize, not the
            // hardcoded constant 56.
            let registerID =
              (rowID - rowStart) * columnBlockSize + (columnID - columnStart)
            registerAllocation[registerID] =
              attentionMask[rowID * sequenceLength + columnID]
          }
        }
      default:
        fatalError("This should never happen.")
      }
      // Materialize a block of the attention matrix.
      // ...
      // Add the register allocation to the temporarily materialized block of
      // the attention matrix.
      // ...
    }
  }
  return blockMask
}
// MFA 2 (fixes the bottlenecks in CPU latency and GPU memory allocation)
//
// Instead of materializing the full [seqLen x seqLen] mask, only the mixed
// blocks straddling the causal boundary are materialized, and each block's
// code + compressed storage address are packed into a single UInt32.
//
// - Parameter sequenceLength: attention sequence length. Defaults to 32,000,
//   matching the original benchmark.
// - Returns: the packed block mask, row-major over blocks. Low 8 bits are the
//   code (0 = all -infinity, 1 = all zero, 2 = mixed); the upper 24 bits are
//   the block's address in the sparse mask storage, divided by the block
//   area. Discardable so existing `generateBlockMask2()` callers compile
//   unchanged.
@discardableResult
func generateBlockMask2(sequenceLength: Int = 32_000) -> [UInt32] {
  // Expose the block size to the CPU.
  let rowBlockSize: Int = 32
  let columnBlockSize: Int = 56 // Query the best block size from the NNC backend.
  // 1,000 x 571 blocks at seqLen = 32,000.
  let rowBlocks = sequenceLength / rowBlockSize
  let columnBlocks = sequenceLength / columnBlockSize
  let blockArea = rowBlockSize * columnBlockSize

  var blockCodes = [UInt8](repeating: .zero, count: rowBlocks * columnBlocks)
  // -1 marks blocks with no backing storage (codes 0 and 1).
  var blockAddresses = [Int32](repeating: -1, count: rowBlocks * columnBlocks)
  // Sparse storage: only mixed blocks append here — a few megabytes instead
  // of 2 gigabytes.
  var attentionMask: [Float16] = []

  // On the CPU, s4nnc side.
  // - Create the attention mask in CPU code, with an s4nnc API for
  //   programmable blockmasks.
  // - About 2,500,000 clock cycles; ~one millisecond of latency.
  //
  // SERIAL FOR LOOP
  for rowBlockID in 0..<rowBlocks {
    for columnBlockID in 0..<columnBlocks {
      let rowStart = rowBlockID * rowBlockSize
      let rowEnd = rowStart + rowBlockSize
      let columnStart = columnBlockID * columnBlockSize
      let columnEnd = columnStart + columnBlockSize
      // This classification is invoked once per block (571,000 times at
      // seqLen = 32,000).
      let blockID = rowBlockID * columnBlocks + columnBlockID

      // FIX: the original used `rowEnd < columnStart` / `rowStart >
      // columnEnd`, off by one on both edges (the ends are exclusive), which
      // needlessly materialized blocks that touch but do not cross the
      // causal boundary.
      if rowEnd <= columnStart {
        // Reached in ~49.9% of the iterations: every element satisfies
        // rowID < columnID, so the whole block is zero.
        blockCodes[blockID] = 1
        continue
      }
      if rowStart >= columnEnd {
        // Reached in ~49.9% of the iterations: every element satisfies
        // rowID > columnID, so the whole block is -infinity.
        blockCodes[blockID] = 0
        continue
      }

      // Reached in only ~0.2% of the iterations: a mixed block. Record where
      // its values start, then materialize exactly blockArea values.
      blockCodes[blockID] = 2
      blockAddresses[blockID] = Int32(attentionMask.count)
      for rowID in rowStart..<rowEnd {
        for columnID in columnStart..<columnEnd {
          // Invoked ~2 million times in total across all mixed blocks.
          let maskValue: Float16 = (rowID <= columnID) ? 0 : -.infinity
          attentionMask.append(maskValue)
        }
      }
    }
  }

  // ~2 megabytes of memory.
  var blockMask = [UInt32](repeating: .zero, count: rowBlocks * columnBlocks)
  // On the CPU, in the libNNC MFA backend.
  // - Execute on the CPU instead of the GPU.
  // - Now, the MFA backend doesn't need to code an entire GPU kernel just
  //   to overcome an implementation issue.
  // - ~500,000 clock cycles; ~200 microseconds of latency.
  // - One-time cost (we don't do this every time an attention kernel is
  //   dispatched, just when the attention mask is first created).
  //
  // SERIAL FOR LOOP
  for blockID in 0..<(rowBlocks * columnBlocks) {
    let code = blockCodes[blockID]
    let address = blockAddresses[blockID]
    // Addresses within the sparse array are multiples of blockArea (each
    // mixed block appends exactly blockArea values), so dividing by it
    // compresses the 32-bit address into 24 bits.
    // FIX: guard the -1 sentinel explicitly instead of relying on
    // -1 / blockArea truncating to 0, and derive the divisor from the block
    // sizes rather than hardcoding 32 * 56.
    if code == 2 {
      blockMask[blockID] =
        UInt32(code) | ((UInt32(address) / UInt32(blockArea)) &<< 8)
    } else {
      blockMask[blockID] = UInt32(code)
    }
  }

  // Attention kernel on the GPU.
  //
  // Dispatch rowBlocks threadgroups.
  // Each threadgroup runs columnBlocks loop iterations.
  //
  // PARALLEL FOR LOOP
  for groupID in 0..<rowBlocks {
    var registerAllocation = [Float16](repeating: .zero, count: blockArea)
    for loopIterationID in 0..<columnBlocks {
      // FIX: the blockmask row stride is columnBlocks (571 at seqLen =
      // 32,000), not columnBlockSize (56); the old index read the wrong rows
      // of the blockmask.
      let blockMaskValue = blockMask[groupID * columnBlocks + loopIterationID]
      switch UInt8(truncatingIfNeeded: blockMaskValue) {
      case 0:
        // Reached in ~49.9% of the iterations: fully masked block.
        continue // skip forward to next iteration
      case 1:
        // Reached in ~49.9% of the iterations: fully unmasked block.
        break // exits the switch; falls through to the materialization below
      case 2:
        // Reached in ~0.2% of the iterations.
        // ============================================================== //
        // THIS IS THE PART OF THE MFA2 CODEBASE THAT HAS CHANGED.
        // (the other part being deletion of the generate_blockmask kernel)
        // ============================================================== //
        let rowStart = groupID * rowBlockSize
        let columnStart = loopIterationID * columnBlockSize
        // Decode the 24-bit compressed address back to an element offset.
        let baseAddress = Int(blockMaskValue &>> 8) * blockArea
        // Run in parallel over threads.
        for rowID in rowStart..<(rowStart + rowBlockSize) {
          for columnID in columnStart..<(columnStart + columnBlockSize) {
            let registerID =
              (rowID - rowStart) * columnBlockSize + (columnID - columnStart)
            registerAllocation[registerID] =
              attentionMask[baseAddress + registerID]
          }
        }
      default:
        fatalError("This should never happen.")
      }
      // Materialize a block of the attention matrix.
      // ...
      // Add the register allocation to the temporarily materialized block of
      // the attention matrix.
      // ...
    }
  }
  return blockMask
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment