Several investigations were conducted into the performance of matrix multiplication, kernel design, and differences between Apple GPU generations. This document serves as an index of that research, making it easier to locate old data and code.
Date: 5/24/24
// MFA 1 (current implementation) | |
func generateBlockMask1() { | |
// 2 gigabytes of memory. | |
// - This memory bottleneck predates MFA. | |
// - Originated from the method of running naive attention in PyTorch: | |
// allocate a statically shaped tensor of dimension [seqlen x seqlen] | |
var attentionMask = [Float16](repeating: .zero, count: 32_000 * 32_000) | |
// On the CPU, s4nnc side. | |
// - Create the attention mask in CPU code, with an s4nnc API. |
### M1 Max Statistics ### | |
0 - device store | |
1 - device store, with two-part load/store | |
2 - threadgroup store | |
3 - threadgroup store, with two-part load/store | |
FP32 (48x48x24) | |
problemSize = 976 | A B | 896 -> 896 -> 832 -> 832 | 7197 -> 7198 -> 7228 -> 7236 ( +1, +31, +39) GFLOPS |
High-resolution data for switching from direct (device) to async (threadgroup) stores. | |
M1 Max, FP32 | |
problemSize = 1488 | A B | 8004 -> 8001 (-3) GFLOPS | |
problemSize = 1489 | A B | 7538 -> 7585 (+47) GFLOPS | |
problemSize = 1490 | A B | 7603 -> 7637 (+34) GFLOPS | |
problemSize = 1491 | A B | 7608 -> 7655 (+47) GFLOPS | |
problemSize = 1492 | A B | 7679 -> 7700 (+21) GFLOPS | |
problemSize = 1493 | A B | 7653 -> 7683 (+30) GFLOPS |
import Foundation | |
import HDL | |
import MM4 | |
import Numerics | |
import OpenMM | |
func createGeometry() -> [Entity] { | |
// Create the compiled structure. | |
var hexagon = Hexagon() | |
hexagon.minimize() |
// | |
// main.swift | |
// UnifiedGEMMKernel | |
// | |
// Created by Philip Turner on 5/29/24. | |
// | |
import Metal | |
#if os(macOS) | |
import IOKit |
// | |
// main.swift | |
// M4LowPrecisionMath | |
// | |
// Created by Philip Turner on 5/28/24. | |
// | |
import Metal | |
// Investigating the performance of low- and mixed-precision computations after |
// | |
// main.swift | |
// HackingAIR | |
// | |
// Created by Philip Turner on 5/27/24. | |
// | |
import Metal | |
// Hacking AIR to open up the Apple GPU to general-purpose linear algebra. |
// | |
// Workspace.swift | |
// M4DeviceTesting | |
// | |
// Created by Philip Turner on 5/24/24. | |
// | |
import Metal | |
// Investigation of Float32 performance before and after dynamic caching on |
import Foundation | |
import HDL | |
import MM4 | |
import Numerics | |
import OpenMM | |
func createGeometry() -> [Entity] { | |
var hexagon = Hexagon() | |
hexagon.center() | |
return hexagon.topology.atoms |