@fxmarty
fxmarty / gist:3595f800150a0e54f037db9829364113
Created July 12, 2023 11:21
ONNX Runtime multiprocessing for multi-GPU
import torch  # imported only because the cuDNN/CUDA dependency is otherwise not met on the cluster
import time
import onnxruntime as ort
import numpy as np
import multiprocessing as mp
from multiprocessing import Queue
class PickableInferenceSession:
"""

Some weights of the model checkpoint at fxmarty/tiny-llama-fast-tokenizer were not used when initializing LlamaForCausalLM: ['model.layers.1.self_attn.q_proj.weight', 'model.layers.0.self_attn.v_proj.weight', 'model.layers.1.self_attn.v_proj.weight', 'model.layers.0.self_attn.k_proj.weight', 'model.layers.1.self_attn.k_proj.weight', 'model.layers.0.self_attn.q_proj.weight']
- This IS expected if you are initializing LlamaForCausalLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForCausalLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LlamaForCausalLM were not initialized from the model checkpoint at fxmarty/tiny-llama-fast-tokenizer and are newly initialized: ['model.layers.0.self_attn.qkv_proj.
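
The preview above cuts off at the PickableInferenceSession docstring. Below is a minimal, self-contained sketch of the usual pattern the gist's title describes: a wrapper that drops the unpicklable InferenceSession when pickling and rebuilds it inside the worker process, with one process per GPU via the CUDAExecutionProvider device_id option. The model path, input name, and shapes are illustrative, not taken from the gist.

import multiprocessing as mp

import numpy as np
import onnxruntime as ort


class PickableInferenceSession:
    """Wrapper making InferenceSession picklable: the raw session is dropped
    when pickling and rebuilt from the model path in the receiving process."""

    def __init__(self, model_path, device_id=0):
        self.model_path = model_path
        self.device_id = device_id
        self.session = self._init_session()

    def _init_session(self):
        providers = [
            ("CUDAExecutionProvider", {"device_id": self.device_id}),
            "CPUExecutionProvider",
        ]
        return ort.InferenceSession(self.model_path, providers=providers)

    def run(self, *args, **kwargs):
        return self.session.run(*args, **kwargs)

    def __getstate__(self):
        # the underlying InferenceSession itself cannot be pickled
        return {"model_path": self.model_path, "device_id": self.device_id}

    def __setstate__(self, state):
        self.__dict__.update(state)
        self.session = self._init_session()


def worker(session, n_requests=10):
    # hypothetical input name and shape; adapt to the actual model
    feed = {"input": np.random.randn(1, 3, 224, 224).astype(np.float32)}
    for _ in range(n_requests):
        session.run(None, feed)


if __name__ == "__main__":
    mp.set_start_method("spawn")  # fork does not mix well with CUDA contexts
    sessions = [PickableInferenceSession("model.onnx", device_id=i) for i in range(2)]
    processes = [mp.Process(target=worker, args=(s,)) for s in sessions]
    for p in processes:
        p.start()
    for p in processes:
        p.join()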
@fxmarty
fxmarty / profiling_onnxruntime.py
Created November 22, 2022 15:00
Plot onnxruntime profiling
import torch
import json
import pandas as pd
import matplotlib.pyplot as plt
import os
from pathlib import Path
import onnxruntime
from tqdm import tqdm
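
A minimal sketch of the workflow those imports suggest, assuming the standard ONNX Runtime profiling API: enable profiling on the session, run a few inferences, then aggregate the emitted JSON trace per operator type and plot it. The model path and input feed are illustrative.

import json

import matplotlib.pyplot as plt
import numpy as np
import onnxruntime
import pandas as pd

options = onnxruntime.SessionOptions()
options.enable_profiling = True
session = onnxruntime.InferenceSession(
    "model.onnx", options, providers=["CPUExecutionProvider"]
)

# hypothetical input name and shape; adapt to the actual model
feed = {"input": np.random.randn(1, 3, 224, 224).astype(np.float32)}
for _ in range(10):
    session.run(None, feed)

profile_path = session.end_profiling()  # path of the JSON trace written by ORT

with open(profile_path) as f:
    events = json.load(f)

# keep per-node events and sum their durations (microseconds) per operator type
df = pd.DataFrame([e for e in events if e.get("cat") == "Node"])
df["op_name"] = df["args"].apply(lambda a: a.get("op_name", "unknown"))
df.groupby("op_name")["dur"].sum().sort_values().plot.barh()
plt.xlabel("total duration (us)")
plt.tight_layout()
plt.savefig("ort_profile.png")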
@fxmarty
fxmarty / script.py
Created November 18, 2022 10:08
Compare variable batch size vs fixed very long batch size
"""
A minimal script to compare inference with variable batch sizes vs a fixed batch size long enough to handle all cases.
Change `padding_style` to compare.
"""
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
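
A minimal sketch of the comparison the docstring describes, assuming `padding_style` switches the tokenizer between padding to the longest sequence in the batch and padding everything to one fixed max_length. The checkpoint, sentences, and lengths are illustrative, not taken from the gist.

import time

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

padding_style = "longest"  # or "max_length"

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint).eval()

sentences = ["a short one", "a noticeably longer example sentence than the first"] * 8

with torch.inference_mode():
    start = time.perf_counter()
    inputs = tokenizer(
        sentences,
        padding=padding_style,
        max_length=128 if padding_style == "max_length" else None,
        truncation=True,
        return_tensors="pt",
    )
    model(**inputs)
    elapsed_ms = (time.perf_counter() - start) * 1e3
print(f"{padding_style}: {elapsed_ms:.1f} ms")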
@fxmarty
fxmarty / benchmark.py
Last active November 14, 2022 16:35
Benchmark torchdynamo vs vanilla pytorch
import torch
torch.backends.cuda.matmul.allow_tf32 = True
import argparse
import copy
from tqdm import tqdm
from transformers import AutoModel
import torch._dynamo as dynamo
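
A minimal sketch of how such a benchmark is typically structured, assuming `dynamo.optimize("inductor")` as the compilation entry point (the torch._dynamo API of that era); the checkpoint, shapes, and iteration counts are illustrative, not taken from the gist.

import time

import torch
import torch._dynamo as dynamo
from transformers import AutoModel

torch.backends.cuda.matmul.allow_tf32 = True

model = AutoModel.from_pretrained("bert-base-uncased").eval().cuda()
inputs = {
    "input_ids": torch.randint(0, 1000, (8, 128), device="cuda"),
    "attention_mask": torch.ones(8, 128, dtype=torch.int64, device="cuda"),
}


def bench(fn, n_iters=50):
    # warmup (also triggers compilation for the dynamo path), then timed loop
    for _ in range(5):
        fn()
    torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(n_iters):
        fn()
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / n_iters * 1e3  # ms per iteration


with torch.inference_mode():
    eager_ms = bench(lambda: model(**inputs))
    compiled = dynamo.optimize("inductor")(model)
    compiled_ms = bench(lambda: compiled(**inputs))

print(f"eager: {eager_ms:.2f} ms | dynamo: {compiled_ms:.2f} ms | speedup: {eager_ms / compiled_ms:.2f}x")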
@fxmarty
fxmarty / Dockerfile
Last active November 14, 2022 16:13
Dockerfile to test torchdynamo
FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
# to be used along https://github.com/sgugger/torchdynamo-tests
# build with `docker build -f Dockerfile -t container-torchdynamo .`
# run with `docker run --gpus device=4 -it -v $(pwd)/scripts:/workspace container-torchdynamo:latest python verify_dynamo.py`
# and then run with `docker run --gpus device=4 -it -v $(pwd)/scripts:/workspace container-torchdynamo:latest python benchmark.py --use-cuda`
# `verify_dynamo.py`: comes from https://github.com/sgugger/torchdynamo-tests
Benchmark results (HF_time and BT_time are the measured latencies for the vanilla Transformers model and its BetterTransformer counterpart; Speedup = HF_time / BT_time):

batch_size  seq_len  pad_percentage  HF_time  BT_time  Speedup
8           64       0               11.26    6.47     1.74
8           64       0.1             11.09    6.63     1.67
8           64       0.2             11.40    6.56     1.74
8           64       0.5             11.14    6.47     1.72
8           64       0.75            11.57    6.56     1.76
8           128      0               14.26    12.09    1.18
8           128      0.1             14.50    12.21    1.19
8           128      0.2             14.79    10.96    1.35

batch_size  seq_len  pad_percentage  HF_time  BT_time  Speedup
8           64       0               25.16    13.50    1.86
8           64       0.1             24.83    13.80    1.80
8           64       0.2             24.82    13.48    1.84
8           64       0.5             24.60    13.33    1.85
8           64       0.75            24.64    13.04    1.89
8           128      0               25.47    13.46    1.89
8           128      0.1             25.54    13.84    1.85
8           128      0.2             25.62    13.65    1.88