Goals: add links that give reasonable, well-written explanations of how things work. No hype and, where possible, no vendor content. Practical first-hand accounts of running models in production are especially sought.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, StaticCache
import torch
from typing import Optional

device = "cuda"

# Copied from the gpt-fast repo
def multinomial_sample_one_no_sync(probs_sort):
    # Multinomial sampling via the exponential-race (Gumbel-max style) trick,
    # done entirely on-device so no CUDA synchronization is needed.
    q = torch.empty_like(probs_sort).exponential_(1)
    return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)
```
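For context, a minimal usage sketch (my own, not from gpt-fast): turn the model's last-token logits into probabilities with softmax and sample with the helper above. The model name and temperature are arbitrary placeholders.

```python
# Sketch only: "gpt2" and the 0.8 temperature are illustrative choices.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

inputs = tokenizer("The capital of France is", return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(**inputs).logits[:, -1, :]        # logits for the next token
probs = torch.softmax(logits / 0.8, dim=-1)          # temperature-scaled probabilities
next_token = multinomial_sample_one_no_sync(probs)   # sample without a device sync
print(tokenizer.decode([next_token.item()]))
```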
````python
class DeepCacheStandAlone:
    """
    @source https://github.com/horseee/DeepCache
    Standalone version of DeepCache, which can be used without the DeepCacheScript.
    For multiple switching UNets, you can specify cache_type to use different caches.
    Code Snippet:
    ```python
    # U-Net Encoder
````
```python
'''
https://arxiv.org/abs/2312.00858
1. put this file in ComfyUI/custom_nodes
2. load node from <loaders>
start_step, end_step: apply this method when the timestep is between start_step and end_step
cache_interval: interval of caching (1 means no caching)
cache_depth: depth of caching
'''
```
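To make cache_interval and cache_depth concrete, here is a rough sketch of the caching idea as I understand it from the DeepCache paper: the deep, low-resolution U-Net features change slowly across denoising steps, so you recompute them only every cache_interval steps and reuse them in between. The helper names and stub bodies below are hypothetical placeholders, not DeepCache's actual API.

```python
import numpy as np

# Stand-in stubs so the sketch runs; a real implementation would call the diffusion U-Net.
def run_full_unet(x, t):
    deep = 0.5 * x                       # pretend these are the deep, low-res features
    return x - 0.01 * t, deep

def run_shallow_unet_with_cache(x, t, deep):
    return x - 0.01 * t + 0.0 * deep     # shallow layers only, reusing cached deep features

def denoise_with_deepcache(x, timesteps, cache_interval=3, start_step=0, end_step=1000):
    cached = None
    for i, t in enumerate(timesteps):
        in_range = start_step <= i <= end_step
        if (not in_range) or cached is None or i % cache_interval == 0:
            x, cached = run_full_unet(x, t)                # full pass refreshes the cache
        else:
            x = run_shallow_unet_with_cache(x, t, cached)  # cheap pass reuses it
    return x

out = denoise_with_deepcache(np.ones(4), timesteps=range(10), cache_interval=3)
```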
This worked on 14/May/23. The instructions will probably require updating in the future.

LLaMA is a text prediction model similar to GPT-2, or to the base (not fine-tuned) version of GPT-3. It should also be possible to run fine-tuned versions such as Alpaca or Vicuna with this; those versions are more focused on answering questions.

Note: I have been told that this does not support multiple GPUs; it can only use a single GPU.

It is now possible to run LLaMA 13B with a 6GB graphics card (e.g. an RTX 2060), thanks to the amazing work on llama.cpp. The latest change is CUDA/cuBLAS support, which lets you pick an arbitrary number of transformer layers to run on the GPU. This is perfect for low VRAM.
llama.cpp commit at the time of writing: 08737ef720f0510c7ec2aa84d7f70c691073c35d
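As an illustration of the layer-offloading option described above, here is a sketch using the llama-cpp-python bindings (on the llama.cpp CLI the equivalent is the `-ngl`/`--n-gpu-layers` flag). The model path, quantization and layer count are placeholders; tune the layer count to your VRAM.

```python
# Sketch: run part of the model on the GPU, the rest on the CPU.
from llama_cpp import Llama

llm = Llama(
    model_path="./models/llama-13b.Q4_0.gguf",  # placeholder path/quantization
    n_gpu_layers=20,   # number of transformer layers to offload to the GPU
    n_ctx=2048,
)
out = llm("Q: What is the capital of France? A:", max_tokens=32)
print(out["choices"][0]["text"])
```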
```python
import json
import pickle
import struct
import zipfile
import numpy as np
from sentencepiece import SentencePieceProcessor

# RMSNorm: scale by the root-mean-square over the last axis (epsilon for stability).
def rms_norm(x):
    return x / np.sqrt(np.square(x).mean(-1, keepdims=True) + 1e-6)

# Numerically stable softmax over the last axis.
def softmax(x):
    e = np.exp(x - np.max(x, axis=-1, keepdims=True))
    return e / np.sum(e, axis=-1, keepdims=True)
```
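As a quick sanity check (my addition, not part of the original gist), the two helpers above can be exercised on a dummy activation matrix:

```python
# Dummy (batch, hidden) activations; assumes the imports and helpers defined above.
x = np.random.randn(2, 8).astype(np.float32)
print(rms_norm(x).shape)        # (2, 8); each row now has roughly unit RMS
print(softmax(x).sum(axis=-1))  # each row sums to 1.0
```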
Apple M1 Ultra, 20 Core CPU, 48 Core GPU, 64GB of RAM, 1TB SSD

Thanks to @fhlipZero (https://twitter.com/fhlipZero) for running the benchmark on his hardware and allowing me to publish it. A copy of both a short benchmark and the following full run can be found at https://gist.github.com/fhlip0

hashcat (v6.2.5-340-g98b89e43d) starting in benchmark mode

Benchmarking uses hand-optimized kernel code by default.
```lua
local M = {}

local function configure()
  -- Point dap-install at its installation path under the Neovim data directory.
  local dap_install = require "dap-install"
  dap_install.setup {
    installation_path = vim.fn.stdpath "data" .. "/dapinstall/",
  }

  -- Sign definitions for DAP breakpoints.
  local dap_breakpoint = {
    error = {
```