# from 65B_script.py
# (integer values are CUDA device indices; 'cpu' means the module is offloaded to system RAM)
{
'model.embed_tokens': 0,
'model.layers.0': 0,
'model.layers.1': 0,
'model.layers.2': 0,
'model.layers.3': 0,
'model.layers.4': 0,
'model.layers.5': 0,
'model.layers.6': 0,
'model.layers.7': 0,
'model.layers.8': 0,
'model.layers.9': 0,
'model.layers.10': 0,
'model.layers.11': 0,
'model.layers.12': 0,
'model.layers.13': 1,
'model.layers.14': 1,
'model.layers.15': 1,
'model.layers.16': 1,
'model.layers.17': 1,
'model.layers.18': 1,
'model.layers.19': 1,
'model.layers.20': 1,
'model.layers.21': 1,
'model.layers.22': 1,
'model.layers.23': 1,
'model.layers.24': 1,
'model.layers.25': 1,
'model.layers.26': 1,
'model.layers.27': 1,
'model.layers.28': 'cpu',
'model.layers.29': 'cpu',
'model.layers.30': 'cpu',
'model.layers.31': 'cpu',
'model.layers.32': 'cpu',
'model.layers.33': 'cpu',
'model.layers.34': 'cpu',
'model.layers.35': 'cpu',
'model.layers.36': 'cpu',
'model.layers.37': 'cpu',
'model.layers.38': 'cpu',
'model.layers.39': 'cpu',
'model.layers.40': 'cpu',
'model.layers.41': 'cpu',
'model.layers.42': 'cpu',
'model.layers.43': 'cpu',
'model.layers.44': 'cpu',
'model.layers.45': 'cpu',
'model.layers.46': 'cpu',
'model.layers.47': 'cpu',
'model.layers.48': 'cpu',
'model.layers.49': 'cpu',
'model.layers.50': 'cpu',
'model.layers.51': 'cpu',
'model.layers.52': 'cpu',
'model.layers.53': 'cpu',
'model.layers.54': 'cpu',
'model.layers.55': 'cpu',
'model.layers.56': 'cpu',
'model.layers.57': 'cpu',
'model.layers.58': 'cpu',
'model.layers.59': 'cpu',
'model.layers.60': 'cpu',
'model.layers.61': 'cpu',
'model.layers.62': 'cpu',
'model.layers.63': 'cpu',
'model.layers.64': 'cpu',
'model.layers.65': 'cpu',
'model.layers.66': 'cpu',
'model.layers.67': 'cpu',
'model.layers.68': 'cpu',
'model.layers.69': 'cpu',
'model.layers.70': 'cpu',
'model.layers.71': 'cpu',
'model.layers.72': 'cpu',
'model.layers.73': 'cpu',
'model.layers.74': 'cpu',
'model.layers.75': 'cpu',
'model.layers.76': 'cpu',
'model.layers.77': 'cpu',
'model.layers.78': 'cpu',
'model.layers.79': 'cpu',
'model.norm': 'cpu',
'lm_head': 'cpu'
}
# from 65B_script8bit.py
# (nothing is offloaded to CPU: the 8-bit weights fit across the two GPUs)
{
"model.embed_tokens": 0,
"model.layers.0": 0,
"model.layers.1": 0,
"model.layers.2": 0,
"model.layers.3": 0,
"model.layers.4": 0,
"model.layers.5": 0,
"model.layers.6": 0,
"model.layers.7": 0,
"model.layers.8": 0,
"model.layers.9": 0,
"model.layers.10": 0,
"model.layers.11": 0,
"model.layers.12": 0,
"model.layers.13": 0,
"model.layers.14": 0,
"model.layers.15": 0,
"model.layers.16": 0,
"model.layers.17": 0,
"model.layers.18": 0,
"model.layers.19": 0,
"model.layers.20": 0,
"model.layers.21": 0,
"model.layers.22": 0,
"model.layers.23": 0,
"model.layers.24": 0,
"model.layers.25": 0,
"model.layers.26": 0,
"model.layers.27": 0,
"model.layers.28": 0,
"model.layers.29": 0,
"model.layers.30": 0,
"model.layers.31": 0,
"model.layers.32": 0,
"model.layers.33": 0,
"model.layers.34": 0,
"model.layers.35": 0,
"model.layers.36": 0,
"model.layers.37": 0,
"model.layers.38": 0,
"model.layers.39": 0,
"model.layers.40": 1,
"model.layers.41": 1,
"model.layers.42": 1,
"model.layers.43": 1,
"model.layers.44": 1,
"model.layers.45": 1,
"model.layers.46": 1,
"model.layers.47": 1,
"model.layers.48": 1,
"model.layers.49": 1,
"model.layers.50": 1,
"model.layers.51": 1,
"model.layers.52": 1,
"model.layers.53": 1,
"model.layers.54": 1,
"model.layers.55": 1,
"model.layers.56": 1,
"model.layers.57": 1,
"model.layers.58": 1,
"model.layers.59": 1,
"model.layers.60": 1,
"model.layers.61": 1,
"model.layers.62": 1,
"model.layers.63": 1,
"model.layers.64": 1,
"model.layers.65": 1,
"model.layers.66": 1,
"model.layers.67": 1,
"model.layers.68": 1,
"model.layers.69": 1,
"model.layers.70": 1,
"model.layers.71": 1,
"model.layers.72": 1,
"model.layers.73": 1,
"model.layers.74": 1,
"model.layers.75": 1,
"model.layers.76": 1,
"model.layers.77": 1,
"model.layers.78": 1,
"model.layers.79": 1,
"model.norm": 1,
"lm_head": 1
}

Checking how device_map="auto" lays out the TheBloke/guanaco-65B-HF model in memory on a workstation with 96GB of VRAM (2x 48GB GPUs) and ~500GB of system RAM.
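
A quick back-of-envelope check explains both layouts: the model's ~65B parameters take roughly 130GB in float16, which exceeds the 96GB of VRAM, so accelerate spills layers to CPU RAM; in 8-bit they take roughly 65GB, which fits across the two GPUs. A minimal sketch of the arithmetic (the parameter count is approximate):

params = 65.2e9  # approximate parameter count of the 65B model
print(f"fp16: ~{params * 2 / 1e9:.0f} GB")  # ~130 GB > 96 GB VRAM, so CPU offload
print(f"int8: ~{params * 1 / 1e9:.0f} GB")  # ~65 GB < 96 GB VRAM, fits on the GPUs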

Deps:

pip install -q -U bitsandbytes
pip install -q -U git+https://github.com/huggingface/transformers.git
pip install -q -U git+https://github.com/huggingface/peft.git
pip install -q -U git+https://github.com/huggingface/accelerate.git
pip install scipy

Files:

  • 65B_script.py - load the 65B model without quantisation
  • 65B_output.txt - the 65B model's layout in memory without quantisation
  • 65B_script8bit.py - load the 65B model with 8-bit quantisation
  • 65B_output8bit.txt - the 65B model's layout in memory with 8-bit quantisation
from transformers import AutoModelForCausalLM

model_id = "TheBloke/guanaco-65B-HF"

# let accelerate spread the fp16 weights across both GPUs,
# spilling whatever doesn't fit to CPU RAM
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, device_map="auto"
)
model.hf_device_map
# see 65B_output.txt
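
To sanity-check a layout without reading the whole dict, tally the modules per device. The counts in the comment below are read off 65B_output.txt (embed_tokens plus layers 0-12 on GPU 0, layers 13-27 on GPU 1, the rest on CPU):

from collections import Counter

print(Counter(model.hf_device_map.values()))
# Counter({'cpu': 54, 1: 15, 0: 14})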
from transformers import AutoModelForCausalLM

model_id = "TheBloke/guanaco-65B-HF"

# same, but with bitsandbytes 8-bit quantisation, which halves
# the weight footprint relative to fp16
model = AutoModelForCausalLM.from_pretrained(
    model_id, trust_remote_code=True, device_map="auto",
    load_in_8bit=True
)
model.hf_device_map
# see 65B_output8bit.txt
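
If the automatic layout isn't what you want (for example, to leave VRAM headroom for activations and the KV cache), from_pretrained also accepts a max_memory dict of per-device budgets that device_map="auto" will respect. A sketch, with illustrative (not measured) budgets:

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/guanaco-65B-HF",
    trust_remote_code=True,
    device_map="auto",
    load_in_8bit=True,
    # budgets are illustrative; leave headroom for activations
    max_memory={0: "44GiB", 1: "44GiB", "cpu": "400GiB"},
)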