Zhengyang Liu zhengyang92

## foo.ll
; https://alive2.llvm.org/ce/z/bKDTJd
; https://godbolt.org/z/v1qYrb5Ka
; @src: multiplies the float argument by 0.0, then multiplies that product
; by 3.0, and returns the result.
; NOTE(review): neither fmul carries fast-math flags; presumably this is the
; "before" side of a transformation being checked against @tgt — confirm
; against the Alive2 link.
define float @src(float %0) {
entry:
  ; %1 = %0 * 0.0
  %1 = fmul float %0, 0.000000e+00
  ; %2 = %1 * 3.0
  %2 = fmul float %1, 3.000000e+00
  ret float %2
}

define float @tgt(float %0) {

## Makefile
# Compile every out_*.c in the current directory to assembly (out_*.S).
#
# $(wildcard ...) is make's built-in glob: unlike `$(shell ls out_*.c)` it
# spawns no subprocess and expands to an empty list (instead of printing an
# `ls` error) when no file matches.
SRCS = $(wildcard out_*.c)

# One .S output per .c input.
OPT_PROGS = $(patsubst %.c,%.S,$(SRCS))

# `asm` is a goal name, not a file; declaring it phony keeps a stray file
# called `asm` from making the goal look up to date.
.PHONY: asm
asm: $(OPT_PROGS)

CFLAGS = -O3 -march=native

# Pattern rule: emit assembly for a single source file.
# The recipe line must start with a TAB character, not spaces.
%.S : %.c
	gcc $(CFLAGS) -S  -o $@ $<

## all_reduce.cu
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include<algorithm>
#include "mpi.h"
#include "accl_util.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

## all_gather.cu
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include<algorithm>
#include "mpi.h"
#include "accl_util.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

## simulation.py
import math
total_workload = 1024 # byte

buffersize = 256 # bytes of shared memory in the system
dma_cost = 100 # loading one byte requires 100 unit of time
compute_cost = 100 #  computing one byte requires 100 unit of time


# 1 output tile requires 2 input tiles
for tilesize in range(1, 256):

## gist:ea315381f1cb67caf3a455b796478fbf
***directRecvCopySend (baseline)***
#       size         count    type     time   algbw   busbw  error     time   algbw   busbw  error
#        (B)    (elements)             (us)  (GB/s)  (GB/s)            (us)  (GB/s)  (GB/s)
        1024            32   float    38.90    0.03    0.02  0e+00    39.19    0.03    0.02  0e+00
        2048            64   float    39.33    0.05    0.05  0e+00    38.80    0.05    0.05  0e+00
        4096           128   float    39.25    0.10    0.09  0e+00    41.12    0.10    0.09  0e+00
        8192           256   float    39.28    0.21    0.18  0e+00    39.32    0.21    0.18  0e+00
       16384           512   float    40.68    0.40    0.35  0e+00    40.56    0.40    0.35  0e+00
       32768          1024   float    42.74    0.77    0.67  0e+00    42.36    0.77    0.68  0e+00
       65536          2048   float    45.60    1.44    1.26  0e+00    45.39    1.44    1.26  0e+00

## case.cpp
case 0: {
    for (int step = 0 ; step < NUM_STEPS; step ++) {
        size_t num_srcs = srcoffset_0.c15[step].length;
        if (threadIdx.x == 0) {
            bool l;
            do {
                l = false;
                for (size_t t = 0 ; t < num_srcs; t ++) {
                    l ||= (wait_0.b15[step].b[t] && (condvar[srcoffset_0.c15[step].c[t]].x != 1));
                }

## gist:f50de9c8c3218890ce55d80131c2d490
#define NUM_STEPS 15
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include "mpi.h"
#include "accl_util.h"
#include "cuda_runtime_api.h"
#define CUDACHECK(cmd) do {                         \

## allgather.cu
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include<algorithm>
#include "mpi.h"
#include "accl_util.h"
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

## gist:b56a5e405c27027d21d8577743bad5e7
#define NUM_STEPS 15
#include<stdio.h>
#include<assert.h>
#include<stdlib.h>
#include<string.h>
#include<stdint.h>
#include "mpi.h"
#include "accl_util.h"
#include "cuda_runtime_api.h"
#define CUDACHECK(cmd) do {                         \
	; https://alive2.llvm.org/ce/z/bKDTJd
	; https://godbolt.org/z/v1qYrb5Ka
	define float @src(float %0) {
	entry:
	%1 = fmul float %0, 0.000000e+00
	%2 = fmul float %1, 3.000000e+00
	ret float %2
	}

	define float @tgt(float %0) {
	SRCS = $(shell ls out_*.c)

	OPT_PROGS = $(patsubst %.c,%.S,$(SRCS))

	asm: $(OPT_PROGS)

	CFLAGS = -O3 -march=native

	%.S : %.c
	gcc $(CFLAGS) -S -o $@ $<
	#include<stdio.h>
	#include<assert.h>
	#include<stdlib.h>
	#include<string.h>
	#include<stdint.h>
	#include<algorithm>
	#include "mpi.h"
	#include "accl_util.h"
	#include <cooperative_groups.h>
	namespace cg = cooperative_groups;
	import math
	total_workload = 1024 # byte

	buffersize = 256 # bytes of shared memory in the system
	dma_cost = 100 # loading one byte requires 100 unit of time
	compute_cost = 100 # computing one byte requires 100 unit of time


	# 1 output tile requires 2 input tiles
	for tilesize in range(1, 256):
	*directRecvCopySend (baseline)*
	# size count type time algbw busbw error time algbw busbw error
	# (B) (elements) (us) (GB/s) (GB/s) (us) (GB/s) (GB/s)
	1024 32 float 38.90 0.03 0.02 0e+00 39.19 0.03 0.02 0e+00
	2048 64 float 39.33 0.05 0.05 0e+00 38.80 0.05 0.05 0e+00
	4096 128 float 39.25 0.10 0.09 0e+00 41.12 0.10 0.09 0e+00
	8192 256 float 39.28 0.21 0.18 0e+00 39.32 0.21 0.18 0e+00
	16384 512 float 40.68 0.40 0.35 0e+00 40.56 0.40 0.35 0e+00
	32768 1024 float 42.74 0.77 0.67 0e+00 42.36 0.77 0.68 0e+00
	65536 2048 float 45.60 1.44 1.26 0e+00 45.39 1.44 1.26 0e+00
	case 0: {
	for (int step = 0 ; step < NUM_STEPS; step ++) {
	size_t num_srcs = srcoffset_0.c15[step].length;
	if (threadIdx.x == 0) {
	bool l;
	do {
	l = false;
	for (size_t t = 0 ; t < num_srcs; t ++) {
	l \|\|= (wait_0.b15[step].b[t] && (condvar[srcoffset_0.c15[step].c[t]].x != 1));
	}
	#define NUM_STEPS 15
	#include<stdio.h>
	#include<assert.h>
	#include<stdlib.h>
	#include<string.h>
	#include<stdint.h>
	#include "mpi.h"
	#include "accl_util.h"
	#include "cuda_runtime_api.h"
	#define CUDACHECK(cmd) do { \