Cheng Li (cli99): GitHub Gists
cli99 / bet-base-flops.txt
Created June 24, 2022 22:24
bert-base-flops
-------------------------- DeepSpeed Flops Profiler --------------------------
Profile Summary at step 1:
Notations:
data parallel size (dp_size), model parallel size(mp_size),
number of parameters (params), number of multiply-accumulate operations(MACs),
number of floating-point operations (flops), floating-point operations per second (FLOPS),
fwd latency (forward propagation latency), bwd latency (backward propagation latency),
step (weights update latency), iter latency (sum of fwd, bwd and step latency)
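A note on the legend: one multiply-accumulate counts as two floating-point operations, so the profiler's flops figure is roughly twice its MACs figure. A quick illustration (the MACs value below is made up, not from the gist):

macs = 11.2e9        # hypothetical MACs reading, for illustration only
flops = 2 * macs     # one MAC = one multiply + one add
print(f'{flops / 1e9:.1f} GFLOPs')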
cli99 / profile_inception_v4.py
Last active June 25, 2022 02:37
profile_inception_v4
import timm
from timm.data import resolve_data_config
from timm.data.transforms_factory import create_transform
import torch
from deepspeed.profiling.flops_profiler import get_model_profile
model = timm.create_model('inception_v4', pretrained=True)
model.eval()
config = resolve_data_config({}, model=model)
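# (preview truncated) a sketch of how the profiling call likely continues,
# using DeepSpeed's get_model_profile; the batch size and keyword values
# below are assumptions, not recovered from the gist
batch_size = 1
flops, macs, params = get_model_profile(
    model=model,
    input_shape=(batch_size, *config['input_size']),  # (1, 3, 299, 299) for inception_v4
    print_profile=True,
    detailed=True,
    as_string=True,
)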
#!/bin/sh
# requirements
# - wget
# prepare directory
mkdir -p ~/.local/bin
mkdir -p ~/.config
if ! [ -x "$(command -v tmux)" ]; then
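  # (preview truncated) a hypothetical continuation: a common pattern fetches
  # a static tmux build with wget into ~/.local/bin; the URL variable is a
  # placeholder, not recovered from the original gist
  wget -q -O ~/.local/bin/tmux "$TMUX_STATIC_BINARY_URL"
  chmod +x ~/.local/bin/tmux
fi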
cli99 / gist:c95f9ae4a79adab64ebb9d8aedebd52b
Last active September 20, 2022 21:01
ds gpt2 test ar prior
import deepspeed
import transformers
import torch
import datetime
import os
import time
from transformers import pipeline, AutoModel, AutoTokenizer, AutoConfig
import sys
def run_model(name="gpt2", enable_cuda_graph=False, batch=2, seq=65):
from transformers import pipeline, AutoModel, AutoTokenizer, AutoConfig
import transformers
import deepspeed
import torch
import os
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))
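# (preview truncated) a sketch of how run_model likely continues; the
# init_inference arguments are modeled on DeepSpeed's inference examples
# and are assumptions, not recovered from the gist
pipe = pipeline('text-generation', model=name, device=local_rank)
pipe.model = deepspeed.init_inference(pipe.model,
                                      mp_size=world_size,
                                      dtype=torch.float16,
                                      replace_with_kernel_inject=True,
                                      enable_cuda_graph=enable_cuda_graph)
print(pipe('DeepSpeed is', max_new_tokens=seq, batch_size=batch))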
def see_memory_usage():
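    # (preview truncated) a sketch of a typical body, modeled on
    # deepspeed.runtime.utils.see_memory_usage
    print(f'allocated: {torch.cuda.memory_allocated() / 2**30:.2f} GB, '
          f'max allocated: {torch.cuda.max_memory_allocated() / 2**30:.2f} GB, '
          f'reserved: {torch.cuda.memory_reserved() / 2**30:.2f} GB')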
from csv import QUOTE_NONE
from platform import libc_ver
from transformers import pipeline
import transformers
import deepspeed
import torch
import os
from transformers import BertLayer
from deepspeed.module_inject.replace_policy import HFBertLayerPolicy
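# (preview truncated) a guess at what the two imports above are for:
# DeepSpeed's injection_policy can map a transformer layer class to a
# replace policy; every argument below is an assumption, not recovered
# from the gist (local_rank/world_size as read from the environment earlier)
pipe = pipeline('fill-mask', model='bert-base-uncased', device=local_rank)
pipe.model = deepspeed.init_inference(pipe.model,
                                      mp_size=world_size,
                                      dtype=torch.float16,
                                      injection_policy={BertLayer: HFBertLayerPolicy})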
cli99 / ipc_demo.py
Created September 30, 2022 18:41 — forked from lebedov/ipc_demo.py
Demonstrate how to pass IPC handles to GPU data between processes in Python
#!/usr/bin/env python
"""
Demonstrate how to pass IPC handles to GPU data between processes in Python.
"""
import ctypes
import numpy as np
import multiprocessing as mp
import zmq
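The preview stops at the imports. As a self-contained illustration of the same idea (not the forked gist's zmq-based approach), torch.multiprocessing passes CUDA tensors between processes by IPC handle rather than by copy:

import torch
import torch.multiprocessing as mp

def consumer(q):
    t = q.get()   # arrives as an IPC handle to the producer's GPU buffer
    t += 1        # the in-place update is visible to the producer

if __name__ == '__main__':
    mp.set_start_method('spawn')
    q = mp.Queue()
    t = torch.zeros(4, device='cuda')
    p = mp.Process(target=consumer, args=(q,))
    p.start()
    q.put(t)      # CUDA tensors cross process boundaries via IPC handles
    p.join()
    print(t)      # tensor([1., 1., 1., 1.], device='cuda:0')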
cli99 / nsight.sh
Created February 27, 2023 22:43 — forked from mcarilli/nsight.sh
Favorite nsight systems profiling commands for Pytorch scripts
# This isn't supposed to run as a bash script; I named it with ".sh" for syntax highlighting.
# https://developer.nvidia.com/nsight-systems
# https://docs.nvidia.com/nsight-systems/profiling/index.html
# My preferred nsys (command line executable used to create profiles) commands
#
# In your script, write
# torch.cuda.nvtx.range_push("region name")
# ...
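The comments above reference torch.cuda.nvtx ranges; a minimal runnable illustration of the annotation pattern (the region name is arbitrary):

import torch

x = torch.randn(1024, 1024, device='cuda')
torch.cuda.nvtx.range_push('matmul')   # shows up as a named range on the nsys timeline
y = x @ x
torch.cuda.nvtx.range_pop()
torch.cuda.synchronize()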
cli99 / fsdp_mnist.py
Created September 26, 2023 18:08
fsdp mnist
# Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py
import os
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import AutoTokenizer
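The preview shows only imports; a minimal sketch of the wrapping step the gist's title implies, using the standard torch.distributed.fsdp API (Net stands in for the MNIST ConvNet from the linked example, and rank handling is simplified):

import os
import torch
import torch.distributed as dist
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

dist.init_process_group('nccl')              # launched via torchrun, one process per GPU
local_rank = int(os.environ['LOCAL_RANK'])
torch.cuda.set_device(local_rank)
model = FSDP(Net().to(local_rank))           # shards parameters, grads, and optimizer state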
cli99 / memory_stats.py
Last active October 26, 2023 23:02
torch.cuda.memory_stats
import torch
import torch.nn as nn
from typing import Optional, Dict, Union, List
from contextlib import nullcontext
_MEMORY_KEYS = {
'allocated_bytes.all.current': 'current_allocated_mem',
'active_bytes.all.current': 'current_active_mem',
'inactive_split_bytes.all.current': 'current_inactive_mem',
'reserved_bytes.all.current': 'current_reserved_mem',
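    # ... (preview truncated; the gist lists more keys)
}

# a sketch of how such a key map is typically consumed, using the real
# torch.cuda.memory_stats() API; the helper name is hypothetical
def get_memory_report() -> Dict[str, int]:
    stats = torch.cuda.memory_stats()
    return {name: stats[key] for key, name in _MEMORY_KEYS.items()}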