Skip to content

Instantly share code, notes, and snippets.

View dfyz's full-sized avatar

Ivan Komarov dfyz

View GitHub Profile
@dfyz
dfyz / main.cu
Last active June 27, 2024 20:09
// nvcc -O2 -std=c++17 -gencode=arch=compute_80,code=sm_80 -I .../cutlass/include -I .../cutlass/tools/util/include --expt-relaxed-constexpr -lcublas -o main main.cu
#include <cutlass/gemm/device/default_gemm_configuration.h>
#include <cutlass/layout/matrix.h>
#include <cutlass/numeric_types.h>
#include <cutlass/gemm/device/gemm.h>
#include <cutlass/gemm/device/gemm_grouped.h>
#include <cutlass/gemm/kernel/gemm_grouped.h>
#include <cutlass/gemm/kernel/default_gemm_grouped.h>
#include <cutlass/util/host_tensor.h>
@dfyz
dfyz / pytorch.patch
Created May 21, 2024 20:51
A hackish example of integrating NCCL kernel-level profiling into PyTorch
diff --git a/torch/csrc/autograd/profiler_kineto.cpp b/torch/csrc/autograd/profiler_kineto.cpp
index c68eb18099..f7038051d3 100644
--- a/torch/csrc/autograd/profiler_kineto.cpp
+++ b/torch/csrc/autograd/profiler_kineto.cpp
@@ -31,6 +31,9 @@
#ifdef USE_KINETO
#include <libkineto.h>
#include <time_since_epoch.h>
+#include <dlfcn.h>
+
>src\forktest.com --ftrace
FUN 2556 12684 5'596'885 440 &__nocolor_init
FUN 2556 12684 5'810'695 456 &getenv
FUN 2556 12684 5'934'173 440 &winclock_init
FUN 2556 12684 6'158'823 456 &QueryPerformanceCounter
FUN 2556 12684 6'378'783 456 &QueryPerformanceFrequency
FUN 2556 12684 6'981'566 440 &outinit
FUN 2556 12684 7'126'687 456 &__fflush_register
FUN 2556 12684 7'245'432 504 &pthread_mutex_lock
FUN 2556 12684 7'351'612 552 &nsync_mu_lock
@dfyz
dfyz / linux_aarch64.png
Last active February 25, 2024 14:35
TeX goes cross-platform
linux_aarch64.png
@dfyz
dfyz / bad.txt
Created February 9, 2024 03:18
Busytex AR shenanigans
native/texlive/libs/potrace/Makefile:AR = ar
native/texlive/libs/gmp/Makefile:AR = ar
native/texlive/libs/libpng/Makefile:AR = ar
native/texlive/libs/pixman/Makefile:AR = ar
native/texlive/libs/harfbuzz/Makefile:AR = ar
native/texlive/libs/zziplib/Makefile:AR = ar
native/texlive/libs/teckit/Makefile:AR = ar
native/texlive/libs/gd/Makefile:AR = ar
native/texlive/libs/cairo/Makefile:AR = ar
native/texlive/libs/zlib/Makefile:AR = ar
from pwn import *
import string
PROMPT = b'> '
def main():
with remote('blackout.seccon.games', 9999) as tube:
@dfyz
dfyz / brute.cpp
Created September 13, 2023 23:10
/***
* Instructions:
*
* Just code everything in C, and add __device__ before every memory and func
* that should be accessible on GPU :)
*
* Test on GCC without GPU, just forks:
* gcc -x c -O3 -march=native -o brut brut.cu ; time ./brut 147db93f3e45f64:952781782a15ae6e
*
* CUDA prod:
from pwn import *
# padding; push 0; ret
TARGET = int.from_bytes(b'\x00\x00\x6a\x00\xc3', byteorder='little')
PROMPT = b': '
SHELLCODE = r'''
xor eax, eax
xor rdx, rdx
xor rsi, rsi
@dfyz
dfyz / RB.litmus
Last active May 7, 2023 18:51
Modeling incorrect SPSC ring buffer
AArch64 RB.correct
"
Theoretical results:
States 2
1:X6=0; 1:X9=61;
1:X6=1; 1:X9=41;
"
{
0:X3=x; 0:X4=y; 0:X5=z;
1:X3=x; 1:X4=y; 1:X5=z;