Philip Turner philipturner

## BlockSparseAttentionMask.swift
// MFA 1 (current implementation)
func generateBlockMask1() {
  // 2 gigabytes of memory.
  // - This memory bottleneck predates MFA.
  // - Originated from the method of running naive attention in PyTorch:
  //   allocate a statically shaped tensor of dimension [seqlen x seqlen]
  var attentionMask = [Float16](repeating: .zero, count: 32_000 * 32_000)

  // On the CPU, s4nnc side.
  // - Create the attention mask in CPU code, with an s4nnc API.

## BF16PerformanceData.txt
### M1 Max Statistics ###

0 - device store
1 - device store, with two-part load/store
2 - threadgroup store
3 - threadgroup store, with two-part load/store

FP32 (48x48x24)

problemSize =  976 | A   B   |  896 ->  896 ->  832 ->  832 | 7197 -> 7198 -> 7228 -> 7236 (  +1,  +31,  +39) GFLOPS

## AccumulatorStorePerformanceData.txt
High-resolution data for switching from direct (device) to async (threadgroup) stores.

M1 Max, FP32

problemSize = 1488 | A   B   | 8004 -> 8001 (-3) GFLOPS
problemSize = 1489 | A   B   | 7538 -> 7585 (+47) GFLOPS
problemSize = 1490 | A   B   | 7603 -> 7637 (+34) GFLOPS
problemSize = 1491 | A   B   | 7608 -> 7655 (+47) GFLOPS
problemSize = 1492 | A   B   | 7679 -> 7700 (+21) GFLOPS
problemSize = 1493 | A   B   | 7653 -> 7683 (+30) GFLOPS

## HexagonDraft3.swift
import Foundation
import HDL
import MM4
import Numerics
import OpenMM

func createGeometry() -> [Entity] {
  // Create the compiled structure.
  var hexagon = Hexagon()
  hexagon.minimize()

## MetalResearch.md

      
              1 file
            
          
              0 forks
            
          
              0 comments
            
          
              0 stars
            
          
                philipturner
                / MetalResearch.md
            
            
              Last active
              May 29, 2024 14:59
            
          
    Metal Research

Several investigations were conducted into the performance of matrix multiplication, kernel design, and the differences between Apple GPU generations. This document indexes that research, making it easier to locate old data and code.
Archived Source Files

M4DeviceTesting.swift

Date: 5/24/24

  
## UnifiedGEMMKernel.swift
//
//  main.swift
//  UnifiedGEMMKernel
//
//  Created by Philip Turner on 5/29/24.
//

import Metal
#if os(macOS)
import IOKit

## M4LowPrecisionMath.swift
//
//  main.swift
//  M4LowPrecisionMath
//
//  Created by Philip Turner on 5/28/24.
//

import Metal

// Investigating the performance of low- and mixed-precision computations after

## HackingAIR.swift
//
//  main.swift
//  HackingAIR
//
//  Created by Philip Turner on 5/27/24.
//

import Metal

// Hacking AIR to open up the Apple GPU to general-purpose linear algebra.

## M4DeviceTesting.swift
//
//  Workspace.swift
//  M4DeviceTesting
//
//  Created by Philip Turner on 5/24/24.
//

import Metal

// Investigation of Float32 performance before and after dynamic caching on

## HexagonDraft2.swift
import Foundation
import HDL
import MM4
import Numerics
import OpenMM

func createGeometry() -> [Entity] {
  var hexagon = Hexagon()
  hexagon.center()
  return hexagon.topology.atoms
	// MFA 1 (current implementation)
	func generateBlockMask1() {
	// 2 gigabytes of memory.
	// - This memory bottleneck predates MFA.
	// - Originated from the method of running naive attention in PyTorch:
	// allocate a statically shaped tensor of dimension [seqlen x seqlen]
	var attentionMask = [Float16](repeating: .zero, count: 32_000 * 32_000)

	// On the CPU, s4nnc side.
	// - Create the attention mask in CPU code, with an s4nnc API.
	### M1 Max Statistics ###

	0 - device store
	1 - device store, with two-part load/store
	2 - threadgroup store
	3 - threadgroup store, with two-part load/store

	FP32 (48x48x24)

	problemSize =  976 | A   B   |  896 ->  896 ->  832 ->  832 | 7197 -> 7198 -> 7228 -> 7236 (  +1,  +31,  +39) GFLOPS
	High-resolution data for switching from direct (device) to async (threadgroup) stores.

	M1 Max, FP32

	problemSize = 1488 | A   B   | 8004 -> 8001 (-3) GFLOPS
	problemSize = 1489 | A   B   | 7538 -> 7585 (+47) GFLOPS
	problemSize = 1490 | A   B   | 7603 -> 7637 (+34) GFLOPS
	problemSize = 1491 | A   B   | 7608 -> 7655 (+47) GFLOPS
	problemSize = 1492 | A   B   | 7679 -> 7700 (+21) GFLOPS
	problemSize = 1493 | A   B   | 7653 -> 7683 (+30) GFLOPS
	import Foundation
	import HDL
	import MM4
	import Numerics
	import OpenMM

	func createGeometry() -> [Entity] {
	// Create the compiled structure.
	var hexagon = Hexagon()
	hexagon.minimize()
	//
	// main.swift
	// UnifiedGEMMKernel
	//
	// Created by Philip Turner on 5/29/24.
	//

	import Metal
	#if os(macOS)
	import IOKit
	//
	// main.swift
	// M4LowPrecisionMath
	//
	// Created by Philip Turner on 5/28/24.
	//

	import Metal

	// Investigating the performance of low- and mixed-precision computations after
	//
	// main.swift
	// HackingAIR
	//
	// Created by Philip Turner on 5/27/24.
	//

	import Metal

	// Hacking AIR to open up the Apple GPU to general-purpose linear algebra.
	//
	// Workspace.swift
	// M4DeviceTesting
	//
	// Created by Philip Turner on 5/24/24.
	//

	import Metal

	// Investigation of Float32 performance before and after dynamic caching on