Younes Belkada (younesbelkada), public gists
younesbelkada / benchmark.py
Last active November 17, 2023
Benchmarking the pipeline performance of int8 models
import time
import torch
import numpy as np
import argparse
from transformers import pipeline
parser = argparse.ArgumentParser(description='Benchmark pipeline runtime for int8 models')
parser.add_argument('--batch_size', default=1, type=int, help='batch_size for experiments')
parser.add_argument('--nb_runs', default=10, type=int, help='number of times for repeating experiments')
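The preview stops right after the argument parsing; below is a minimal sketch of how such a pipeline benchmark could continue. The checkpoint name and generation settings are illustrative assumptions, not the gist's actual values.

# Sketch only: assumed continuation, not the gist's actual code.
args = parser.parse_args()

# Hypothetical int8-capable checkpoint; substitute the model under test.
pipe = pipeline(
    "text-generation",
    model="bigscience/bloom-1b7",
    model_kwargs={"load_in_8bit": True, "device_map": "auto"},
)

prompts = ["Hello, my name is"] * args.batch_size
latencies = []
for _ in range(args.nb_runs):
    torch.cuda.synchronize()
    start = time.time()
    pipe(prompts, max_new_tokens=32)
    torch.cuda.synchronize()
    latencies.append(time.time() - start)
print(f"mean: {np.mean(latencies):.4f}s +/- {np.std(latencies):.4f}s per batch")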
younesbelkada / benchmark_generate.py
Created August 2, 2022 21:35
Benchmark using `generate`
import time
import tokenizers
import torch
import numpy as np
import argparse
from transformers import AutoModelForCausalLM, AutoTokenizer
parser = argparse.ArgumentParser(description='Benchmark generate() runtime for int8 models')
parser.add_argument('--batch_size', default=1, type=int, help='batch_size for experiments')
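This preview is also truncated; a hedged sketch of a direct `generate` benchmark under the same argparse setup follows. The checkpoint, prompt, and token budget are assumptions.

# Sketch only: assumed continuation.
args = parser.parse_args()

model_name = "facebook/opt-350m"  # hypothetical checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)

inputs = tokenizer(["Hello, my name is"] * args.batch_size, padding=True, return_tensors="pt").to(model.device)
torch.cuda.synchronize()
start = time.time()
model.generate(**inputs, max_new_tokens=32)
torch.cuda.synchronize()
print(f"generate latency: {time.time() - start:.4f}s for batch_size={args.batch_size}")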
younesbelkada / benchmark_generate_jz.py
Last active August 12, 2022 09:03
Benchmarking inference
import argparse
import datetime
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint", type=str, help="Checkpoint path", required=True)
    parser.add_argument("--max-memory-per-gpu", type=str, help="Maximum memory allocated per GPU, e.g. '50GB'", required=True)
younesbelkada / example_code_int8.py
Last active December 9, 2022 08:00
An example script to run bnb int8 models using `bitsandbytes` and `transformers`
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
MAX_NEW_TOKENS = 128
model_name = 'facebook/opt-66b'
text = """
Q: On average Joe throws 25 punches per minute. A fight lasts 5 rounds of 3 minutes.
How many punches did he throw?\n
A: Let’s think step by step.\n"""
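The snippet stops before the model is loaded; a plausible completion is sketched below. It follows the standard `load_in_8bit` loading pattern from `transformers`, but the gist's exact continuation is not shown here.

# Sketch only: assumed continuation following the standard int8 loading pattern.
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_ids = tokenizer(text, return_tensors="pt").input_ids

# load_in_8bit=True triggers bitsandbytes LLM.int8() quantization at load time.
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)

generated_ids = model.generate(input_ids.to(model.device), max_new_tokens=MAX_NEW_TOKENS)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))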
younesbelkada / shard_weights.py
Last active October 9, 2023 22:17
A script to shard any model in the Hugging Face format
import torch
import os
import json
import argparse
parser = argparse.ArgumentParser(description='Sharding Hugging Face models')
parser.add_argument('--sharding_factor', default=4, type=int, help='Sharding factor - aka how many shards to create')
parser.add_argument('--source_model_path', default="t5-v1_1-xl", type=str, help='Relative path to the source model folder')
parser.add_argument('--sharded_model_path', default="t5-v1_1-xl-sharded", type=str, help='Relative path to the target sharded model folder')
args = parser.parse_args()
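A minimal sketch of one way the sharding could proceed from here; the shard naming follows the Hugging Face pytorch_model-00001-of-0000N.bin convention, but the split strategy (equal key counts rather than equal byte sizes) is an assumption.

# Sketch only: assumed continuation; splits by key count, not byte size.
state_dict = torch.load(os.path.join(args.source_model_path, "pytorch_model.bin"), map_location="cpu")
os.makedirs(args.sharded_model_path, exist_ok=True)

keys = list(state_dict.keys())
shard_len = (len(keys) + args.sharding_factor - 1) // args.sharding_factor
weight_map = {}
for shard_id in range(args.sharding_factor):
    shard_keys = keys[shard_id * shard_len : (shard_id + 1) * shard_len]
    shard_file = f"pytorch_model-{shard_id + 1:05d}-of-{args.sharding_factor:05d}.bin"
    torch.save({k: state_dict[k] for k in shard_keys}, os.path.join(args.sharded_model_path, shard_file))
    weight_map.update({k: shard_file for k in shard_keys})

# Index file mapping every weight to its shard, as expected by from_pretrained.
with open(os.path.join(args.sharded_model_path, "pytorch_model.bin.index.json"), "w") as f:
    json.dump({"metadata": {}, "weight_map": weight_map}, f, indent=2)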
younesbelkada / bnb_example.py
Created August 13, 2022 18:00
A minimal script to run `bitsandbytes` int8 inference
import torch
import torch.nn as nn
from bitsandbytes.nn import Linear8bitLt
# Utility function
def get_model_memory_footprint(model):
    r"""
    Return the model's parameter memory footprint in bytes.
    Partially copied and inspired from: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2
    """
    # Minimal completion of the truncated preview: sum of per-parameter storage sizes.
    return sum(param.nelement() * param.element_size() for param in model.parameters())
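Following the utility above, a minimal sketch of the int8 swap itself: load fp16 weights into a `Linear8bitLt` module and move it to the GPU, which is when `bitsandbytes` quantizes the weights. The layer sizes are illustrative assumptions.

# Sketch only: illustrative sizes, not the gist's actual model.
fp16_linear = nn.Linear(1024, 1024).half()

int8_linear = Linear8bitLt(1024, 1024, has_fp16_weights=False)
int8_linear.load_state_dict(fp16_linear.state_dict())
int8_linear = int8_linear.cuda()  # weights are quantized to int8 on the device move

x = torch.randn(8, 1024, dtype=torch.float16, device="cuda")
print(int8_linear(x).shape)  # torch.Size([8, 1024])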
younesbelkada / save_sequential.py
Created November 8, 2022 15:04
A script to sequentially save any `t5x` checkpoint
from typing import Dict, Union
import torch
from transformers.utils.hub import convert_file_size_to_int
from transformers.utils import WEIGHTS_NAME, WEIGHTS_INDEX_NAME
from transformers.modeling_utils import dtype_byte_size
import os
from transformers.models.switch_transformers.convert_switch_transformers_original_flax_checkpoint_to_pytorch import rename_keys
from flax.traverse_util import flatten_dict, unflatten_dict
from tensorflow.io import gfile
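Only the imports survive in this preview. Per the title, the idea is to write the checkpoint out shard by shard so the full model never sits in host memory at once; the outline below is an assumed reconstruction of that idea, not the gist's actual code.

import numpy as np

# Sketch only: an assumed outline of the sequential save.
def save_sequentially(t5x_params, save_dir, max_shard_size="10GB"):
    max_size = convert_file_size_to_int(max_shard_size)  # e.g. "10GB" -> bytes
    flat_params = flatten_dict(t5x_params, sep="/")
    shard, shard_bytes, shard_id = {}, 0, 1
    for key, value in flat_params.items():
        tensor = torch.from_numpy(np.asarray(value))
        tensor_bytes = tensor.numel() * dtype_byte_size(tensor.dtype)
        if shard and shard_bytes + tensor_bytes > max_size:
            # Flush the current shard before it exceeds the size cap.
            torch.save(shard, os.path.join(save_dir, f"shard-{shard_id:05d}.bin"))
            shard, shard_bytes = {}, 0
            shard_id += 1
        shard[key] = tensor
        shard_bytes += tensor_bytes
    if shard:
        torch.save(shard, os.path.join(save_dir, f"shard-{shard_id:05d}.bin"))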

bert-large-uncased - T4 - half - CUDA

batch_size  seq_len  pad_percentage  HF_time (s)  BT_time (s)  Speedup
8           64       0.00            0.02516      0.01350      1.86x
8           64       0.10            0.02483      0.01380      1.80x
8           64       0.20            0.02482      0.01348      1.84x
8           64       0.50            0.02460      0.01333      1.85x
8           64       0.75            0.02464      0.01304      1.89x
8           128      0.00            0.02547      0.01346      1.89x

bert-base-uncased - A100 - half - CUDA

batch_size  seq_len  pad_percentage  HF_time (s)  BT_time (s)  Speedup
8           64       0.00            0.01295      0.00737      1.76x
8           64       0.10            0.01289      0.00721      1.79x
8           64       0.20            0.01289      0.00739      1.74x
8           64       0.50            0.01276      0.00714      1.79x
8           64       0.75            0.01321      0.00727      1.82x
8           128      0.00            0.01255      0.00751      1.67x
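The HF_time / BT_time columns above read like stock `transformers` latencies versus BetterTransformer latencies. A hedged sketch of how such numbers are typically collected with `optimum` follows; `BetterTransformer.transform` is real optimum API, while the model, batch shape, and run count are illustrative.

import time
import torch
from transformers import AutoModel, AutoTokenizer
from optimum.bettertransformer import BetterTransformer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
hf_model = AutoModel.from_pretrained("bert-base-uncased", torch_dtype=torch.float16).to("cuda")
bt_model = BetterTransformer.transform(hf_model, keep_original_model=True)

# Fixed-shape batch; pad_percentage would be varied by padding part of each sequence.
inputs = tokenizer(["hello"] * 8, padding="max_length", max_length=64, return_tensors="pt").to("cuda")

def mean_latency(model, n_runs=100):
    torch.cuda.synchronize()
    start = time.time()
    with torch.no_grad():
        for _ in range(n_runs):
            model(**inputs)
    torch.cuda.synchronize()
    return (time.time() - start) / n_runs

hf_time, bt_time = mean_latency(hf_model), mean_latency(bt_model)
print(f"HF: {hf_time:.5f}s  BT: {bt_time:.5f}s  speedup: {hf_time / bt_time:.2f}x")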