kmabeeTT / decode_graph_ttnn.mlir
Created March 25, 2026 15:39
KV cache bleed - TTNN decode graph IR (#3899)
#dram = #ttnn.buffer_type<dram>
#l1 = #ttnn.buffer_type<l1>
#loc = loc(unknown)
#loc67 = loc("p0.1")
#loc68 = loc("p1.5")
#loc69 = loc("p2.9")
#loc70 = loc("p3.14")
#loc71 = loc("p4.21")
#loc72 = loc("p5.27")
#loc73 = loc("p6.71")
kmabeeTT / tt_xla_tp_llm_python_memory_leak.md
Created March 2, 2026 16:42
TT-XLA: Python-Level Host Memory Leak After TP LLM Tests — Investigation & Workaround

TT-XLA: Python-Level Host Memory Leak After TP LLM Tests

Date: 2026-03-01 | Repo: tt-xla | PR: #3521


Problem

Claude Code Sound Notification Hooks

Audio alerts for when Claude finishes a task or needs your permission.

Setup

Add this hooks block to ~/.claude/settings.json:

{
kmabeeTT / test_ttnn_sort_boundary.log
Created February 10, 2026 20:11
ttnn.sort duplicate indices bug repro — sort returns non-permutation indices for width > 256
================================================================================
Test 1: Exact boundary (Wt in tiles)
================================================================================
[PASS] Wt= 1: size=32, unique=32, dups=0, worst=0
[PASS] Wt= 2: size=64, unique=64, dups=0, worst=0
[PASS] Wt= 4: size=128, unique=128, dups=0, worst=0
[PASS] Wt= 8: size=256, unique=256, dups=0, worst=0
[FAIL] Wt= 9: size=288, unique=287, dups=1, worst=2
[FAIL] Wt= 10: size=320, unique=319, dups=1, worst=2
[FAIL] Wt= 11: size=352, unique=351, dups=1, worst=2
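The pass/fail columns in the log above can be reproduced off-device with a small checker (pure NumPy, no ttnn dependency): given the indices tensor a sort returns, it reports size, unique, dups (values appearing more than once), and worst (highest repeat count), and whether the indices form a valid permutation of 0..size-1.

```python
import numpy as np

def permutation_report(indices):
    """Report the log's columns for a sort-indices array:
    (is_perm, size, unique, dups, worst)."""
    idx = np.asarray(indices)
    values, counts = np.unique(idx, return_counts=True)
    size = int(idx.size)
    unique = int(values.size)
    dups = int(np.count_nonzero(counts > 1))          # distinct repeated values
    worst = int(counts.max()) if dups else 0          # highest repeat count
    is_perm = unique == size and int(values[0]) == 0 and int(values[-1]) == size - 1
    return is_perm, size, unique, dups, worst

# A valid permutation of width 256 (Wt=8) passes:
print(permutation_report(np.random.permutation(256)))   # (True, 256, 256, 0, 0)

# Injecting one duplicate, as seen at Wt=9 (width 288), fails:
bad = np.arange(288)
bad[100] = 7          # index 7 now appears twice; index 100 is missing
print(permutation_report(bad))                          # (False, 288, 287, 1, 2)
```

The failing rows in the log (e.g. Wt=9: unique=287, dups=1, worst=2) correspond exactly to one index appearing twice and one being dropped, which is what this checker flags.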
kmabeeTT / opt125m_graph_17_ttir.mlir.txt
Created February 6, 2026 05:05
OPT-125M ttnn.sort indices type mismatch (debug build) - Pre-existing issue
#loc1 = loc("p0.1")
#loc2 = loc("p1.37")
#loc3 = loc("p2.42")
#loc4 = loc("p3.82")
#loc5 = loc("p4.121")
#loc6 = loc("p5.150")
module @SyncTensorsGraph.296 attributes {mhlo.cross_program_prefetches = [], mhlo.input_output_alias = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false, ttcore.meshes = #ttcore.meshes<[<"mesh" = 1x1>]>} {
ttcore.device_module {
builtin.module @SyncTensorsGraph.296 attributes {mhlo.cross_program_prefetches = [], mhlo.input_output_alias = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false, ttcore.meshes = #ttcore.meshes<[<"mesh" = 1x1>]>} {
func.func @main(%arg0: tensor<i64> {ttcore.argument_type = #ttcore.argument_type<input>, ttcore.runtime_tensor_sharding = #ttcore<runtime_tensor_sharding shard_status = <unsharded>, local_shape = tensor<i64>>} loc("p0.1"), %arg1: tensor<1xf32> {ttcore.argument_type = #ttcore.argument_type<input>, ttcore.runtime_tensor_sharding = #ttcore<runtime_tensor_sharding shard_status = <unsharded>, local_shape =
kmabeeTT / topk_graph_1_ttir.mlir.txt
Created February 6, 2026 05:01
ttnn.sort UINT16/UINT32 indices type mismatch - Padding threshold bug
#loc1 = loc("p0.1")
module @SyncTensorsGraph.25 attributes {mhlo.cross_program_prefetches = [], mhlo.input_output_alias = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false, ttcore.meshes = #ttcore.meshes<[<"mesh" = 1x1>]>} {
ttcore.device_module {
builtin.module @SyncTensorsGraph.25 attributes {mhlo.cross_program_prefetches = [], mhlo.input_output_alias = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false, ttcore.meshes = #ttcore.meshes<[<"mesh" = 1x1>]>} {
func.func @main(%arg0: tensor<1x50000xf32> {ttcore.argument_type = #ttcore.argument_type<input>, ttcore.runtime_tensor_sharding = #ttcore<runtime_tensor_sharding shard_status = <unsharded>, local_shape = tensor<1x50000xf32>>, ttir.name = "args_0"} loc("p0.1")) -> (tensor<1x100xi64> {ttcore.runtime_tensor_sharding = #ttcore<runtime_tensor_sharding shard_status = <unsharded>, local_shape = tensor<1x100xi64>>}) {
%0 = "ttir.reshape"(%arg0) <{shape = [1 : i32, 1 : i32, 50000 : i32]}> : (tensor<1x50000xf3
kmabeeTT / ANALYSIS.md
Created February 6, 2026 04:59
ttnn.sort UINT16/UINT32 indices type mismatch - Padding threshold bug

ttnn.sort UINT16/UINT32 Indices Type Mismatch

Summary

  • Test: topk with input size (1, 50000)
  • Graph: 1
  • Operation: ttnn.sort at line 9
  • Error: DataType mismatch, expected UINT16, got UINT32
  • Scope: Debug builds only (release builds pass but have latent bug)

Root Cause
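The analysis is truncated in this preview. Purely as an illustration of how a "expected UINT16, got UINT32" mismatch can arise at a padding threshold, the sketch below assumes two code paths decide the indices dtype differently at the 2^16 boundary; the padding rule and function names are hypothetical, not the actual tt-metal implementation.

```python
# Hypothetical sketch of a UINT16/UINT32 threshold mismatch.  The
# power-of-two padding rule and both dtype checks are assumptions
# for illustration only.

def next_power_of_two(n: int) -> int:
    """Round n up to the next power of two (one plausible padding rule)."""
    p = 1
    while p < n:
        p *= 2
    return p

def dtype_from_max_index(padded_width: int) -> str:
    # The largest index is padded_width - 1, which fits UINT16 iff <= 65535.
    return "UINT16" if padded_width - 1 <= 0xFFFF else "UINT32"

def dtype_from_width(padded_width: int) -> str:
    # An off-by-one variant that tests the width itself instead of the
    # largest index flips to UINT32 one element too early.
    return "UINT16" if padded_width <= 0xFFFF else "UINT32"

padded = next_power_of_two(50000)   # 65536
print(padded, dtype_from_max_index(padded), dtype_from_width(padded))
# 65536 UINT16 UINT32  -> the two checks disagree exactly at the boundary
```

With a (1, 50000) input padded to 65536, the two conventions disagree only at that exact boundary, which would explain why the failure is tied to this input size and surfaces as a dtype assertion in debug builds.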

kmabeeTT / SIMPLE_REPRO.md
Created February 5, 2026 19:26
opt125m ttnn_sort f32 type mismatch repro

Simple Repro Commands

These commands reproduce the ttnn.sort f32 type mismatch failure.

Prerequisites

Ensure you have:

  • opt125m_graph_17_ttir.mlir.txt (already in this directory)
  • ttrt-artifacts/system_desc.ttsys (in parent directory)
kmabeeTT / opt125m_graph_17_ttir.mlir.txt
Created February 5, 2026 19:16
OPT-125M ttnn.sort f32 type mismatch repro for https://github.com/tenstorrent/tt-mlir/issues/6926
#loc1 = loc("p0.1")
#loc2 = loc("p1.37")
#loc3 = loc("p2.42")
#loc4 = loc("p3.82")
#loc5 = loc("p4.121")
#loc6 = loc("p5.150")
module @SyncTensorsGraph.296 attributes {mhlo.cross_program_prefetches = [], mhlo.input_output_alias = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false, ttcore.meshes = #ttcore.meshes<[<"mesh" = 1x1>]>} {
ttcore.device_module {
builtin.module @SyncTensorsGraph.296 attributes {mhlo.cross_program_prefetches = [], mhlo.input_output_alias = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false, ttcore.meshes = #ttcore.meshes<[<"mesh" = 1x1>]>} {
func.func @main(%arg0: tensor<i64> {ttcore.argument_type = #ttcore.argument_type<input>, ttcore.runtime_tensor_sharding = #ttcore<runtime_tensor_sharding shard_status = <unsharded>, local_shape = tensor<i64>>} loc("p0.1"), %arg1: tensor<1xf32> {ttcore.argument_type = #ttcore.argument_type<input>, ttcore.runtime_tensor_sharding = #ttcore<runtime_tensor_sharding shard_status = <unsharded>, local_shape =
#loc1 = loc("p0.1")
module @SyncTensorsGraph.25 attributes {mhlo.cross_program_prefetches = [], mhlo.input_output_alias = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false, ttcore.meshes = #ttcore.meshes<[<"mesh" = 1x1>]>} {
ttcore.device_module {
builtin.module @SyncTensorsGraph.25 attributes {mhlo.cross_program_prefetches = [], mhlo.input_output_alias = [], mhlo.is_dynamic = false, mhlo.use_auto_spmd_partitioning = false, ttcore.meshes = #ttcore.meshes<[<"mesh" = 1x1>]>} {
func.func @main(%arg0: tensor<1x10xf32> {ttcore.argument_type = #ttcore.argument_type<input>, ttcore.runtime_tensor_sharding = #ttcore<runtime_tensor_sharding shard_status = <unsharded>, local_shape = tensor<1x10xf32>>, ttir.name = "args_0"} loc("p0.1")) -> (tensor<1x5xi64> {ttcore.runtime_tensor_sharding = #ttcore<runtime_tensor_sharding shard_status = <unsharded>, local_shape = tensor<1x5xi64>>}) {
%0 = "ttir.reshape"(%arg0) <{shape = [1 : i32, 1 : i32, 10 : i32]}> : (tensor<1x10xf32>) -> tensor<1x