Skip to content

Instantly share code, notes, and snippets.

View chengscott's full-sized avatar

Scott Cheng chengscott

View GitHub Profile
@chengscott
chengscott / README.md
Created April 22, 2024 00:50
Run different jobs at specific time points on weekdays using a systemd template unit
systemctl daemon-reload
systemctl enable --now job@{08:00,12:00,21:00}.timer
#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
class MeanTracker {
int total_ = 0;
float mean_ = 0.f;
public:
#include <iostream>
// Inclusive prefix sum of `val` across the lanes of a warp.
//
// Each pass fetches the running sum held by the lane `offset` positions below
// via __shfl_up_sync and adds it in. Offsets double (1, 2, 4, 8, 16), so after
// the last pass every lane holds the sum of its own input and the inputs of
// all lower-numbered lanes. The mask 0xffffffff names all 32 lanes as
// participants in the shuffle.
//
// NOTE(review): the lane index is derived from threadIdx.x alone; presumably
// this is launched with 1D thread blocks. Confirm against the caller.
__device__ int warpInclusiveScan(int val) {
  const int lane = threadIdx.x % warpSize;
  int offset = 1;
  while (offset < 32) {
    const int fromBelow = __shfl_up_sync(0xffffffff, val, offset);
    // Lanes with index < offset have no source lane that far below them,
    // so they contribute nothing on this pass.
    val += (lane >= offset) ? fromBelow : 0;
    offset <<= 1;
  }
  return val;
}
diff --git a/python/aitemplate/backend/cuda/conv2d/common.py b/python/aitemplate/backend/cuda/conv2d/common.py
index 8cf7fb2..ca13a72 100644
--- a/python/aitemplate/backend/cuda/conv2d/common.py
+++ b/python/aitemplate/backend/cuda/conv2d/common.py
@@ -501,6 +501,7 @@ def emit_instance(op):
emiter = cutlass_lib.conv2d_operation.EmitConv2dWithBroadcastInstance()
else:
emiter = cutlass_lib.conv2d_operation.EmitConv2dInstance()
+ op.tile_description.stages = 2
op_def = emiter.emit(op)
static size_t GLOBAL_WORKSPACE_SIZE_DeviceConvFwdInstance_0 = 0;
#include <cstdio>
#include <stdexcept>
#include "cutlass/cutlass.h"
#include "cutlass/conv/kernel/default_conv2d_fprop.h"
@chengscott
chengscott / PKGBUILD
Created October 21, 2023 04:17
python-apex-git
# Maintainer: Leo Mao <leomaoyw at gmail dot com>
pkgname=python-apex-git
_pkgname=apex
pkgver=23.08
pkgrel=1
pkgdesc="A PyTorch Extension: Tools for easy mixed precision and distributed training in Pytorch"
arch=('x86_64')
url="https://github.com/NVIDIA/apex"
_github='NVIDIA/apex'
diff --git a/megatron/arguments.py b/megatron/arguments.py
index 4bf1d72..bcea6ce 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -1090,7 +1090,7 @@ def _add_distributed_args(parser):
default=False, help='If set, use custom-built ring exchange '
'for p2p communications. Note that this option will require '
'a custom built image that support ring-exchange p2p.')
- group.add_argument('--local_rank', type=int, default=None,
+ group.add_argument('--local-rank', type=int, default=None,
@chengscott
chengscott / wrapper.py
Last active June 14, 2023 14:55
Cray MPI Wrapper
#!/usr/bin/env python
import os
import sys
import shlex
# Re-export the PMI_* variables under the names RANK, WORLD_SIZE, LOCAL_RANK
# and LOCAL_SIZE. os.putenv changes the process environment (inherited by
# child processes) but does not update os.environ.
# NOTE(review): os.getenv returns None for an unset variable and os.putenv
# does not accept None; presumably this always runs under a launcher that
# sets every PMI_* variable. Confirm.
os.putenv('RANK', os.getenv('PMI_RANK'))
os.putenv('WORLD_SIZE', os.getenv('PMI_SIZE'))
os.putenv('LOCAL_RANK', os.getenv('PMI_LOCAL_RANK'))
os.putenv('LOCAL_SIZE', os.getenv('PMI_LOCAL_SIZE'))
argv = []
if os.getenv('PMI_RANK') == '0':
@chengscott
chengscott / cache.c
Last active January 26, 2023 22:49
sudo sysctl -w vm.nr_hugepages=256; gcc -O2 cache.c -o cache
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <time.h>
#include <unistd.h>
#define BASE_SIZE 128 // array base size = 1KB
#define MAX_SIZE 17 // array size ranges from 1KB to 64MB
[Unit]
Description=Reverse SSH connection
After=network.target
[Service]
Type=simple
ExecStart=/usr/bin/ssh -N -T -o "ServerAliveInterval 10" -o "ExitOnForwardFailure yes" -R 2222:localhost:22 chengscott@host
Restart=always
RestartSec=5s