fxmarty

## scale_mm_example.py
import torch
import torch.nn.functional as F

def to_float8(x, dtype=torch.float8_e4m3fn):
    finfo = torch.finfo(dtype)
    # Calculate the scale as dtype max divided by absmax
    scale = finfo.max / x.abs().max().clamp(min=1e-12)
    # Scale and clamp the tensor to bring it into
    # the representable range of the float8 data type
    # (the default cast is unsaturated)

## part_1_memory_format_and_channels_last_optimization.md

      
              1 file
            
          
              3 forks
            
          
              0 comments
            
          
              16 stars
            
          
                mingfeima
                / part_1_memory_format_and_channels_last_optimization.md
            
            
              Last active
              June 28, 2024 11:03
            
              
                PyTorch CPU Performance Optimization Tutorial - Section I
              
          
    Part I: Memory Formats and Channels Last Optimization

(Training material on PyTorch CPU performance optimization)

Part II: Parallelization Techniques
Part III: Vectorization Techniques
Part IV: BFloat16 Kernel Optimization

Chinese version for this chapter, link.

  
## nsight.sh
# This isn't meant to be run as a bash script; I named it with ".sh" only for syntax highlighting.

# https://developer.nvidia.com/nsight-systems
# https://docs.nvidia.com/nsight-systems/profiling/index.html

# My preferred nsys (command line executable used to create profiles) commands
#
# In your script, write
# torch.cuda.nvtx.range_push("region name")
# ...

## aws_ec2_ubuntu_userdata_docker.sh
#!/bin/bash
# Install docker
# Refresh the package index before installing the prerequisites.
apt-get update
# Install the tools the following steps rely on (curl, HTTPS transport and
# CA certificates for apt, and software-properties-common), plus cloud-utils.
apt-get install -y cloud-utils apt-transport-https ca-certificates curl software-properties-common
# Download Docker's GPG key and register it with apt.
# NOTE(review): apt-key is reported as deprecated on newer Ubuntu releases --
# confirm the target release still supports it, or switch to a signed-by keyring.
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
# Add Docker's "stable" apt repository (amd64) for the Ubuntu codename
# reported by `lsb_release -cs`.
add-apt-repository \
   "deb [arch=amd64] https://download.docker.com/linux/ubuntu \
   $(lsb_release -cs) \
   stable"
# Refresh the index again so packages from the newly added repository are visible.
apt-get update

## add.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return the sum of the two integers a and b. */
int addi(int a, int b) {
    const int sum = a + b;
    return sum;
}

char *adds(char *a, char *b) {
    char *res = malloc(strlen(a) + strlen(b) + 1);
	import torch
	import torch.nn.functional as F

	def to_float8(x, dtype=torch.float8_e4m3fn):
	finfo = torch.finfo(dtype)
	# Calculate the scale as dtype max divided by absmax
	scale = finfo.max / x.abs().max().clamp(min=1e-12)
	# scale and clamp the tensor to bring it to
	# the representative range of float8 data type
	# (as default cast is unsaturated)
	# This isn't supposed to run as a bash script, i named it with ".sh" for syntax highlighting.

	# https://developer.nvidia.com/nsight-systems
	# https://docs.nvidia.com/nsight-systems/profiling/index.html

	# My preferred nsys (command line executable used to create profiles) commands
	#
	# In your script, write
	# torch.cuda.nvtx.range_push("region name")
	# ...
	#!/bin/bash
	# Install docker
	apt-get update
	apt-get install -y cloud-utils apt-transport-https ca-certificates curl software-properties-common
	curl -fsSL https://download.docker.com/linux/ubuntu/gpg \| sudo apt-key add -
	add-apt-repository \
	"deb [arch=amd64] https://download.docker.com/linux/ubuntu \
	$(lsb_release -cs) \
	stable"
	apt-get update
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int addi(int a, int b) {
	return a + b;
	}

	char adds(char a, char *b) {
	char *res = malloc(strlen(a) + strlen(b) + 1);