Cheng Li (cli99), GitHub Gists
Problem

I have two GitHub accounts: oanhnn (personal) and superman (work). I want to use both accounts on the same computer, without typing a password every time I git push or pull.

Solution

Use SSH keys and define host aliases in the SSH config file, one alias per account.

How to?

  1. Generate an SSH key pair for each account and add each public key to the matching GitHub account (see the sketch below this list).
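The remaining steps of the original gist are cut off in this preview. As a minimal sketch, assuming illustrative key file names id_ed25519_oanhnn and id_ed25519_superman, the key generation and the host aliases could look like this:

# Generate one key pair per account (file names are illustrative)
ssh-keygen -t ed25519 -C "personal" -f ~/.ssh/id_ed25519_oanhnn
ssh-keygen -t ed25519 -C "work" -f ~/.ssh/id_ed25519_superman

# ~/.ssh/config: one host alias per account
Host github.com-oanhnn
    HostName github.com
    User git
    IdentityFile ~/.ssh/id_ed25519_oanhnn

Host github.com-superman
    HostName github.com
    User git
    IdentityFile ~/.ssh/id_ed25519_superman

# Clone (or point an existing remote) through the alias instead of github.com
git clone git@github.com-superman:company/project.git

With this setup, SSH resolves each alias to github.com but presents the per-account key, so neither account prompts for a password.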
from vllm import LLM, SamplingParams

model_id = "/mnt/workdisk/chengli/models/llama3.1/llama-70b-instruct"
tensor_parallel_size = 4
llm = LLM(
    model=model_id,
    tensor_parallel_size=tensor_parallel_size,
)
prompts = [
    "Hello, my name is",  # placeholder; the original prompt list is truncated in the preview
]
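The preview stops at the prompt list; a hedged sketch of how the LLM object above would typically be used (the SamplingParams values are illustrative, not from the gist):

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)  # illustrative values
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)  # generated completion for each prompt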
# Offload the embedding and the first four decoder layers to CPU,
# keep layers 4-7 on GPU 1 (device index 1)
device_map = {
    "model.embed_tokens": "cpu",
    "model.layers.0": "cpu",
    "model.layers.1": "cpu",
    "model.layers.2": "cpu",
    "model.layers.3": "cpu",
    "model.layers.4": 1,
    "model.layers.5": 1,
    "model.layers.6": 1,
    "model.layers.7": 1,
    # remaining entries are truncated in the preview
}
# https://github.com/huggingface/transformers/pull/32047
# CUDA Nightly
# pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121/
# pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu121/
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config
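The preview ends at the imports; a hedged sketch of how FbgemmFp8Config is typically passed to from_pretrained (the model name here is illustrative, not from the gist):

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # illustrative checkpoint
quantization_config = FbgemmFp8Config()  # quantize weights to FP8 via fbgemm-gpu
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)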
cli99 / test_fp8.py (Last active August 29, 2024 02:34)
vLLM FP8
import os
import time

import torch
import transformers
from torch.profiler import ProfilerActivity, profile, record_function
from vllm import LLM, SamplingParams

os.environ["HOST_IP"] = "10.42.10.16"  # assumed purpose: host IP for vLLM's distributed init; the preview ends here
cli99 / test_torch_compile.py (Created September 18, 2024 00:14)
torch.compile
import timeit
import torch

# The timings in these comments were measured on the gist's original function,
# which is cut off in the preview.
@torch.compile()  # 0.103 seconds
# @torch.compile(fullgraph=True)  # 0.105 seconds
# @torch.compile(fullgraph=False)  # 0.102 seconds
# @torch.compile(options={"triton.cudagraphs": False}, fullgraph=True)  # 0.104 seconds
# @torch.compile(  # truncated in the preview
def fn(x):
    # placeholder body: the function actually decorated is not shown
    return torch.relu(x)
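The per-variant timings in the comments suggest the gist benchmarks each compile configuration; a hedged sketch of how such numbers could be collected with the timeit import above (input shape and iteration count are illustrative):

x = torch.randn(1024, 1024, device="cuda")
fn(x)  # warm up: the first call triggers compilation
torch.cuda.synchronize()
t = timeit.timeit(lambda: (fn(x), torch.cuda.synchronize()), number=100)
print(f"{t / 100:.3f} seconds per call")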
# speechmatics.com/company/articles-and-news/timing-operations-in-pytorch
import time
import torch

a = torch.randn(1000, 1000, device="cuda")  # 1000 * 1000 * 4 B = 4 MB of float32
torch.softmax(a, dim=1)
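The linked article is about timing CUDA operations correctly; a hedged sketch of the event-based pattern it describes, applied to the softmax above:

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()
torch.softmax(a, dim=1)
end.record()
torch.cuda.synchronize()  # events are recorded asynchronously; wait before reading
print(f"{start.elapsed_time(end):.3f} ms")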
cli99 / test_Float8_e4m3fn.py (Last active September 23, 2024 22:08)
Float8_e4m3fn
import torch
from torch.utils.cpp_extension import load_inline

finfo = torch.finfo(torch.float8_e4m3fn)
print(f"finfo: {finfo}")
# finfo(resolution=1, min=-448, max=448, eps=0.125, smallest_normal=0.015625, tiny=0.015625, dtype=float8_e4m3fn)

cuda_source = """
C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = std::numeric_limits<c10::Float8_e4m3fn>::max();
void test() {
    // body truncated in the preview
}
"""
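As a hedged companion to the finfo probe above, a pure-Python sketch of scaling a tensor into the e4m3 range and casting it (the per-tensor scaling scheme is illustrative, not from the gist):

x = torch.randn(4, 4) * 1000            # values that overflow the FP8 range
scale = finfo.max / x.abs().max()       # per-tensor scale into [-448, 448]
x_fp8 = (x * scale).to(torch.float8_e4m3fn)
print(x_fp8.dtype, x_fp8.to(torch.float32) / scale)  # dequantized view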
cli99 / get_device_info.py (Created October 3, 2024 07:36)
Get CUDA information
import pycuda.autoinit  # creates a CUDA context on import
import pycuda.driver as cuda

# Get the first CUDA device (index 0)
device = cuda.Device(0)

# List of attributes you want to query
attributes = [
    cuda.device_attribute.MAX_THREADS_PER_BLOCK,
    cuda.device_attribute.MAX_BLOCK_DIM_X,
    # more attributes are truncated in the preview
]
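The preview cuts off inside the list; reading the values is presumably a loop over device.get_attribute, sketched here:

print(device.name())
for attr in attributes:
    print(f"{attr}: {device.get_attribute(attr)}")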
cli99 / get_compiled_triton_code.py (Last active October 9, 2024 04:44)
get torch compiled triton code
import torch
from torch._inductor.utils import get_code, get_triton_code

def my_model(x):
    return torch.square(x)

compiled_model = torch.compile(my_model)
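The imports suggest the gist ends by dumping the generated kernel; a hedged sketch, assuming get_triton_code takes the compiled function followed by example inputs:

x = torch.randn(64, 64, device="cuda")     # illustrative input
print(get_triton_code(compiled_model, x))  # Triton source Inductor generated for this graph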