Adam Hill 🦿 adamhill

## mlx_distributed_deepseek.md

      
              1 file
            
          
              14 forks
            
          
                41 comments
              
            
              94 stars
            
          
                awni
                / mlx_distributed_deepseek.md
            
            
              Last active
              December 9, 2025 03:54
            
              
                Run DeepSeek R1 or V3 with MLX Distributed
              
          
    Setup

On every machine in the cluster install openmpi and mlx-lm:
conda install conda-forge::openmpi
pip install -U mlx-lm
Next download the pipeline parallel run script. Download it to the same path on every machine:

  
## power-usage.py
# Power Usage Calculator for AI Workloads

'''
# Serving
$ vllm serve meta-llama/Llama-3.3-70B-Instruct --tensor-parallel-size 4 --num-scheduler-steps 20 --quantization=fp8 --gpu-memory-utilization=0.97
INFO 01-13 04:59:05 api_server.py:712] vLLM API server version 0.6.6.post2.dev5+g5ce4627a

# Benchmark - we do bs=64 to emulate https://arxiv.org/pdf/2310.03003
    cmd = [
        "python", os.path.expanduser("~/vllm/benchmarks/benchmark_serving.py"),

## Array+concurrentMap.swift
// MARK: Transform an array of items
extension Array where Element: Sendable {
    /// Execute a throwing task for each element of the array.
    ///
    /// - All tasks are executed concurrently,
    /// - If the transformer is `nonisolated`, all tasks are executed in parallel.
    /// - The resulting array maintains the same order as the original array.
    /// - If any task throws any error, all tasks are allowed to complete. The resulting error includes an array of
    ///   results where successful elements can be used and failed elements can be handled.
    ///

## run_in_xcode.scpt
tell application "System Events"
	if (name of processes) contains "Xcode" then
		-- Check if Xcode is already the active/focused application
		if (name of first application process whose frontmost is true) is not "Xcode" then
			tell application "Xcode 16.2 (Beta)"
				activate

				delay 0.25 -- Wait for Xcode to become active
			end tell
		end if

## BreathingLeavesView.swift
import RealityKit
import SwiftUI

struct BreathingLeavesView: View {
    let rootEntity = Entity()
    @State var children = [EntityPositionPair]()
    @State private var rotationAngles: SIMD3<Float> = [0, 0, 0]
    @State private var modulationTimer: Timer?
    @State private var time: Double = 0.0
    @State private var lastRotationUpdateTime = CACurrentMediaTime()

## FlamesTextAnimationView.swift
import SwiftUI
import RealityKit

struct FlamesTextAnimationView: View {
    var textLines: [String] = ["WELCOME", "TO", "APP NAME"]
    let commandQueue: MTLCommandQueue
    let computePipeline: MTLComputePipelineState
    @State private var texture: LowLevelTexture?
    let timer = Timer.publish(every: 1.0 / 120.0, on: .main, in: .common).autoconnect()
    @State private var time: Float = 0

## gist:c49b264823be8f1013c258ae991bb4a2
VisionOS notes


Q: Curious what tools or workflows designers are using to mock up things like volumes or immersive spaces considering things like Sketch and Figma are 2D or "window" based?

A: Great question! We find a lot of people start with our visionOS Apple Design Resources especially on Figma (https://www.figma.com/community/file/1253443272911187215) and animate flat views to sell the ideas to partners. Then we've seen folks move to tools like Spline (https://spline.design), Blender, etc. for 3D workflows. In particular, Spline has a visionOS mirror app (https://docs.spline.design/doc/spline-mirror-for-visionos/docaQJC8SwTF) that makes it much more efficient to prototype 3d environments!

Q:  I'm generating mesh data in code and then programmatically creating objects in a volume. As the app progresses, I need to switch out some of those meshes. If this is happening fairly rapidly (multiple times per second), would it generally be better to keep a reference to the entity to be able to update it, or sho

## audio.ni
printf = dlimport 'printf
printf-type = fntype auto s32 ptr
printf_ = vatype printf-type
printf_ptr = vatype printf-type ptr
printf_s32 = vatype printf-type s32
printf_u32 = vatype printf-type u32
printf_u64 = vatype printf-type u64
printf_s32_s32 = vatype printf-type s32 s32
printf_s32_s32_s32 = vatype printf-type s32 s32 s32
printf_u64_s32 = vatype printf-type u64 s32

## highlight_thousands.cljs
;; Joyride thousands highlighter
;; The end goal here is to help humans read long numbers by highlighting groups of thousands
;; First we need to find the groups of thousands. We only want to highlight _odd_ groups
;; of thousands, starting with the least significant group to the most significant group.
;; We also consider the most significant group of thousands when it is not three digits long.

; Here is some test data, line numbers are on the left
; The xxx are the groups of thousands we want to highlight on the numbers on the line above them.
;08  1111111222111333444 :foo 555666 :bar 123 :baz 1234
;09  x   xxx   xxx   xxx         xxx                xxx

## 12ftio.js
javascript:(function()%7Bwindow.location.href%3D'https%3A%2F%2F12ft.io%2F'%2Bwindow.location.href%3B%7D)()
	# Power Usage Calculator for AI Workloads

	'''
	# Serving
	$ vllm serve meta-llama/Llama-3.3-70B-Instruct --tensor-parallel-size 4 --num-scheduler-steps 20 --quantization=fp8 --gpu-memory-utilization=0.97
	INFO 01-13 04:59:05 api_server.py:712] vLLM API server version 0.6.6.post2.dev5+g5ce4627a

	# Benchmark - we do bs=64 to emulate https://arxiv.org/pdf/2310.03003
	cmd = [
	"python", os.path.expanduser("~/vllm/benchmarks/benchmark_serving.py"),
	// MARK: Transform an array of items
	extension Array where Element: Sendable {
	/// Execute a throwing task for each element of the array.
	///
	/// - All tasks are executed concurrently,
	/// - If the transformer is `nonisolated`, all tasks are executed in parallel.
	/// - The resulting array maintains the same order as the original array.
	/// - If any task throws any error, all tasks are allowed to complete. The resulting error includes an array of
	/// results where successful elements can be used and failed elements can be handled.
	///
	tell application "System Events"
	if (name of processes) contains "Xcode" then
	-- Check if Xcode is already the active/focused application
	if (name of first application process whose frontmost is true) is not "Xcode" then
	tell application "Xcode 16.2 (Beta)"
	activate

	delay 0.25 -- Wait for Xcode to become active
	end tell
	end if
	import RealityKit
	import SwiftUI

	struct BreathingLeavesView: View {
	let rootEntity = Entity()
	@State var children = [EntityPositionPair]()
	@State private var rotationAngles: SIMD3<Float> = [0, 0, 0]
	@State private var modulationTimer: Timer?
	@State private var time: Double = 0.0
	@State private var lastRotationUpdateTime = CACurrentMediaTime()
	import SwiftUI
	import RealityKit

	struct FlamesTextAnimationView: View {
	var textLines: [String] = ["WELCOME", "TO", "APP NAME"]
	let commandQueue: MTLCommandQueue
	let computePipeline: MTLComputePipelineState
	@State private var texture: LowLevelTexture?
	let timer = Timer.publish(every: 1.0 / 120.0, on: .main, in: .common).autoconnect()
	@State private var time: Float = 0
	VisionOS notes


	Q: Curious what tools or workflows designers are using to mock up things like volumes or immersive spaces considering things like Sketch and Figma are 2D or "window" based?

	A: Great question! We find a lot of people start with our visionOS Apple Design Resources especially on Figma (https://www.figma.com/community/file/1253443272911187215) and animate flat views to sell the ideas to partners. Then we've seen folks move to tools like Spline (https://spline.design), Blender, etc. for 3D workflows. In particular, Spline has a visionOS mirror app (https://docs.spline.design/doc/spline-mirror-for-visionos/docaQJC8SwTF) that makes it much more efficient to prototype 3d environments!

	Q: I'm generating mesh data in code and then programmatically creating objects in a volume. As the app progresses, I need to switch out some of those meshes. If this is happening fairly rapidly (multiple times per second), would it generally be better to keep a reference to the entity to be able to update it, or sho
	printf = dlimport 'printf
	printf-type = fntype auto s32 ptr
	printf_ = vatype printf-type
	printf_ptr = vatype printf-type ptr
	printf_s32 = vatype printf-type s32
	printf_u32 = vatype printf-type u32
	printf_u64 = vatype printf-type u64
	printf_s32_s32 = vatype printf-type s32 s32
	printf_s32_s32_s32 = vatype printf-type s32 s32 s32
	printf_u64_s32 = vatype printf-type u64 s32
	;; Joyride thousands highlighter
	;; The end goal here is to help humans read long numbers by highlighting groups of thousands
	;; First we need to find the groups of thousands. We only want to highlight _odd_ groups
	;; of thousands, starting with the least significant group to the most significant group.
	;; We also consider the most significant group of thousands when it is not three digits long.

	; Here is some test data, line numbers are on the left
	; The xxx are the groups of thousands we want to highlight on the numbers on the line above them.
	;08 1111111222111333444 :foo 555666 :bar 123 :baz 1234
	;09 x   xxx   xxx   xxx         xxx                xxx