(Training material on pytorch CPU performance optimization)
- Part II: Parallelization Techniques
- Part III: Vectorization Techniques
- Part IV: BFloat16 Kernel Optimization
Chinese version for this chapter, link.
import torch | |
import torch.nn.functional as F | |
def to_float8(x, dtype=torch.float8_e4m3fn): | |
finfo = torch.finfo(dtype) | |
# Calculate the scale as dtype max divided by absmax | |
scale = finfo.max / x.abs().max().clamp(min=1e-12) | |
# scale and clamp the tensor to bring it to | |
# the representative range of float8 data type | |
# (as default cast is unsaturated) |
(Training material on pytorch CPU performance optimization)
Chinese version for this chapter, link.
# This isn't supposed to run as a bash script, i named it with ".sh" for syntax highlighting. | |
# https://developer.nvidia.com/nsight-systems | |
# https://docs.nvidia.com/nsight-systems/profiling/index.html | |
# My preferred nsys (command line executable used to create profiles) commands | |
# | |
# In your script, write | |
# torch.cuda.nvtx.range_push("region name") | |
# ... |
#!/bin/bash | |
# Install docker | |
apt-get update | |
apt-get install -y cloud-utils apt-transport-https ca-certificates curl software-properties-common | |
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - | |
add-apt-repository \ | |
"deb [arch=amd64] https://download.docker.com/linux/ubuntu \ | |
$(lsb_release -cs) \ | |
stable" | |
apt-get update |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
int addi(int a, int b) { | |
return a + b; | |
} | |
char *adds(char *a, char *b) { | |
char *res = malloc(strlen(a) + strlen(b) + 1); |