Skip to content

Instantly share code, notes, and snippets.

View zhengyang92's full-sized avatar
🌚
traveling by telephone

Zhengyang Liu zhengyang92

🌚
traveling by telephone
View GitHub Profile
@zhengyang92
zhengyang92 / foo.ll
Last active February 27, 2024 18:32
; https://alive2.llvm.org/ce/z/bKDTJd
; https://godbolt.org/z/v1qYrb5Ka
; @src: multiplies its float argument by 0.0 and then multiplies that product
; by 3.0, returning the final product. Neither fmul carries fast-math flags.
; NOTE(review): presumably the "source" half of the Alive2 comparison linked
; above; the matching @tgt body is not fully visible here, so confirm there.
define float @src(float %0) {
entry:
; First product: argument times the constant 0.0.
%1 = fmul float %0, 0.000000e+00
; Second product: previous result times the constant 3.0.
%2 = fmul float %1, 3.000000e+00
ret float %2
}
define float @tgt(float %0) {
@zhengyang92
zhengyang92 / Makefile
Created February 27, 2024 05:24
compile herbie dump
# Compile every out_*.c in the current directory to assembly (out_*.S).
# Usage: `make` or `make asm`.

# Use make's built-in glob instead of parsing `ls` output: no subshell, and no
# error message on stderr when nothing matches.
SRCS = $(wildcard out_*.c)
OPT_PROGS = $(patsubst %.c,%.S,$(SRCS))

# -march=native tunes the output for the machine running the build, so the
# generated assembly is not portable across hosts.
CFLAGS = -O3 -march=native

# `asm` names an action, not a file; mark it phony so a stray file called
# `asm` cannot make the target look up to date.
.PHONY: asm
asm: $(OPT_PROGS)

# Pattern rule: one .S per .c, stopping after the compile step (-S).
# The recipe line must begin with a TAB character.
%.S : %.c
	gcc $(CFLAGS) -S -o $@ $<
@zhengyang92
zhengyang92 / all_reduce.cu
Created September 27, 2023 19:58
all_reduce kernel, bandwidth-optimal, dgx-1
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include<algorithm>
#include "mpi.h"
#include "accl_util.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
@zhengyang92
zhengyang92 / all_gather.cu
Created September 27, 2023 19:56
all_gather kernel, bandwidth-optimal, dgx-1
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include<algorithm>
#include "mpi.h"
#include "accl_util.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
import math
total_workload = 1024 # byte
buffersize = 256 # bytes of shared memory in the system
dma_cost = 100 # loading one byte requires 100 unit of time
compute_cost = 100 # computing one byte requires 100 unit of time
# 1 output tile requires 2 input tiles
for tilesize in range(1, 256):
***directRecvCopySend (baseline)***
# size count type time algbw busbw error time algbw busbw error
# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
1024 32 float 38.90 0.03 0.02 0e+00 39.19 0.03 0.02 0e+00
2048 64 float 39.33 0.05 0.05 0e+00 38.80 0.05 0.05 0e+00
4096 128 float 39.25 0.10 0.09 0e+00 41.12 0.10 0.09 0e+00
8192 256 float 39.28 0.21 0.18 0e+00 39.32 0.21 0.18 0e+00
16384 512 float 40.68 0.40 0.35 0e+00 40.56 0.40 0.35 0e+00
32768 1024 float 42.74 0.77 0.67 0e+00 42.36 0.77 0.68 0e+00
65536 2048 float 45.60 1.44 1.26 0e+00 45.39 1.44 1.26 0e+00
case 0: {
for (int step = 0 ; step < NUM_STEPS; step ++) {
size_t num_srcs = srcoffset_0.c15[step].length;
if (threadIdx.x == 0) {
bool l;
do {
l = false;
for (size_t t = 0 ; t < num_srcs; t ++) {
l ||= (wait_0.b15[step].b[t] && (condvar[srcoffset_0.c15[step].c[t]].x != 1));
}
#define NUM_STEPS 15
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include "mpi.h"
#include "accl_util.h"
#include "cuda_runtime_api.h"
#define CUDACHECK(cmd) do { \
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include<algorithm>
#include "mpi.h"
#include "accl_util.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;
#define NUM_STEPS 15
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include "mpi.h"
#include "accl_util.h"
#include "cuda_runtime_api.h"
#define CUDACHECK(cmd) do { \