Mitchell Wortsman (mitchellnw)
mitchellnw / relu-attn-bf16.ipynb
Last active December 26, 2023 04:09
triton-a100.ipynb (notebook preview unavailable)
mitchellnw / relu-attention-fp32.ipynb
Last active December 25, 2023 20:50
relu-attention-fp32.ipynb (notebook preview unavailable)
mitchellnw / stableadamwunfused.py
Last active October 17, 2023 21:40
This is the unfused version of StableAdamW. It is slower than the fused version, which is forthcoming. Beta version.
import numpy as np
import torch

# This is the unfused version of StableAdamW. It is slower than the fused version (coming).
class StableAdamWUnfused(torch.optim.Optimizer):
    def __init__(self, params, lr=0.002, weight_decay=0.2, betas=(0.9, 0.99),
                 eps=1e-6, clip_thresh=1.0, precision="amp_bfloat16", custom_scalar=65536):
        beta1, beta2 = betas
        defaults = dict(lr=lr, weight_decay=weight_decay, beta1=beta1, beta2=beta2)
        super().__init__(params, defaults)  # register params; required for any torch Optimizer
        # Remaining hyperparameters are kept on the instance for step() to use
        # (attribute names are a plausible completion; the gist is truncated here).
        self.eps, self.d = eps, clip_thresh
        self.precision, self.custom_scalar = precision, custom_scalar
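
A minimal usage sketch, assuming the complete class from the full gist (the truncated portion above omits step()). The model, data, and loss below are illustrative assumptions, not part of the gist:

import torch
import torch.nn as nn

model = nn.Linear(16, 4)  # toy model, purely for illustration
optimizer = StableAdamWUnfused(model.parameters(), lr=0.002, weight_decay=0.2)

x, y = torch.randn(8, 16), torch.randn(8, 4)
loss = nn.functional.mse_loss(model(x), y)
loss.backward()
optimizer.step()       # assumes step() is defined later in the full gist
optimizer.zero_grad()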