Luis C. lucataco

## script.py
import torch
import random

random.seed(42)
torch.manual_seed(42)


from transformers import LlamaTokenizer, LlamaForCausalLM
# Model identifier handed to `from_pretrained` to locate the tokenizer files.
model_path = 'openlm-research/open_llama_3b_v2'
# Load the matching tokenizer; `legacy=True` is forwarded to the tokenizer as-is.
tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=True)

## cog.yaml
# Cog build section: GPU-enabled image on Python 3.10 with extra system setup.
build:
  gpu: true
  python_version: "3.10"
  # Setup commands; listed in the order they are meant to run.
  run:
    # Install PPA tooling, register the git-core PPA, and refresh the apt index.
    - apt update -y && apt install -y software-properties-common python3-launchpadlib && apt update -y && add-apt-repository -y ppa:git-core/ppa && apt update -y
    # Compilers, git/git-lfs, download tools, media/GUI libraries, and misc utilities.
    - apt install -y gcc g++ aria2 git git-lfs wget libgl1 libglib2.0-0 ffmpeg cmake libgtk2.0-0 libopenmpi-dev unzip libssl-dev pkg-config tmux ca-certificates
    # Download the CUDA 12.1.0 runfile installer and run it with --silent --toolkit.
    - wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run && sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit
    # Build btop from source, install it under /usr, then remove the checkout.
    - git clone https://github.com/aristocratos/btop /content/btop && cd /content/btop && make && make install PREFIX=/usr && rm -rf /content/btop
    # Notebook tooling plus pyngrok.
    - pip install notebook pyngrok pickleshare ipywidgets
    # Prebuilt vLLM 0.4.0.post1 wheel tagged cp310, matching python_version above.
    - pip install https://github.com/camenduru/wheels/releases/download/replicate/vllm-0.4.0.post1-cp310-cp310-linux_x86_64.whl

## script.py
import os
import cv2
import argparse
import numpy as np

MONITOR_W = 38.5


def write_depth(depth, bits=1, reverse=True):
    depth_min = depth.min()

## ollama_fast_speech_text_speech.py
""" To use: install Ollama, clone OpenVoice, run this script in the OpenVoice directory
    brew install portaudio
    brew install git-lfs
    git lfs install

    git clone https://github.com/myshell-ai/OpenVoice
    cd OpenVoice
    git clone https://huggingface.co/myshell-ai/OpenVoice
    cp -r OpenVoice/* .


## svd.py
# from cog import BasePredictor, Input, Path
import os
import cv2
import time
import math
import torch
import numpy as np
from PIL import Image
from glob import glob
from typing import Optional

## notes.txt
**Goal**: Run benchmarks of SDXL, SVD, and Llama 13B on an L40S test node

**TL;DR**:

- The L40S has the same inference speed as the A40 for SDXL
- The L40S is 10% faster at inference than the A40 for Llama 2
- The L40S is ~9% faster at video rendering than the A40

**Process**: Run non-docker/cog python code for fp16

## sdxl.py
from diffusers import DiffusionPipeline
import torch
import time

# load both base & refiner
# Record the start time before the pipeline is loaded.
t1 = time.time()
# Load the SDXL base pipeline with the fp16 safetensors weights.
base = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)
# Move the pipeline to the CUDA device.
base.to("cuda")

## llama2-13b-chat.py
import os
import time
import torch
from typing import Iterator
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Change this to 512, 1024, or 2048.
MAX_NEW_TOKENS = 512
# Taken from the MAX_INPUT_TOKEN_LENGTH environment variable; falls back to 4096.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

## runllama2.py
import time
import json
import requests


# Start Llama2 13b locally:
# docker run -d -p 5000:5000 --gpus=all r8.im/meta/llama-2-13b@sha256:078d7a002387bd96d93b0302a4c03b3f15824b63104034bfa943c63a8f208c38


url = "http://localhost:5000/predictions"

## runSVD.py
import io
import time
import json
import base64
import requests

# Start SVD (Stable Video Diffusion) locally:
# docker run -d -p 5000:5000 --gpus=all r8.im/stability-ai/stable-video-diffusion@sha256:3f0457e4619daac51203dedb472816fd4af51f3149fa7a9e0b5ffcf1b8172438
	import torch
	import random

	random.seed(42)
	torch.manual_seed(42)


	from transformers import LlamaTokenizer, LlamaForCausalLM
	model_path = 'openlm-research/open_llama_3b_v2'
	tokenizer = LlamaTokenizer.from_pretrained(model_path, legacy=True);
	build:
	gpu: true
	python_version: "3.10"
	run:
	- apt update -y && apt install -y software-properties-common python3-launchpadlib && apt update -y && add-apt-repository -y ppa:git-core/ppa && apt update -y
	- apt install -y gcc g++ aria2 git git-lfs wget libgl1 libglib2.0-0 ffmpeg cmake libgtk2.0-0 libopenmpi-dev unzip libssl-dev pkg-config tmux ca-certificates
	- wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run && sh cuda_12.1.0_530.30.02_linux.run --silent --toolkit
	- git clone https://github.com/aristocratos/btop /content/btop && cd /content/btop && make && make install PREFIX=/usr && rm -rf /content/btop
	- pip install notebook pyngrok pickleshare ipywidgets
	- pip install https://github.com/camenduru/wheels/releases/download/replicate/vllm-0.4.0.post1-cp310-cp310-linux_x86_64.whl
	import os
	import cv2
	import argparse
	import numpy as np

	MONITOR_W = 38.5


	def write_depth(depth, bits=1, reverse=True):
	depth_min = depth.min()
	""" To use: install Ollama, clone OpenVoice, run this script in the OpenVoice directory
	brew install portaudio
	brew install git-lfs
	git lfs install

	git clone https://github.com/myshell-ai/OpenVoice
	cd OpenVoice
	git clone https://huggingface.co/myshell-ai/OpenVoice
	cp -r OpenVoice/* .
	# from cog import BasePredictor, Input, Path
	import os
	import cv2
	import time
	import math
	import torch
	import numpy as np
	from PIL import Image
	from glob import glob
	from typing import Optional
	Goal: Run benchmarks of SDXL, SVD, and Llama 13B on an L40S test node

	TL;DR:

	- The L40S has the same inference speed as the A40 for SDXL
	- The L40S is 10% faster at inference than the A40 for Llama 2
	- The L40S is ~9% faster at video rendering than the A40

	Process: Run non-docker/cog python code for fp16
	from diffusers import DiffusionPipeline
	import torch
	import time

	# load both base & refiner
	t1 = time.time()
	base = DiffusionPipeline.from_pretrained(
	"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
	)
	base.to("cuda")
	import time
	import json
	import requests


	# Start Llama2 13b locally:
	# docker run -d -p 5000:5000 --gpus=all r8.im/meta/llama-2-13b@sha256:078d7a002387bd96d93b0302a4c03b3f15824b63104034bfa943c63a8f208c38


	url = "http://localhost:5000/predictions"
	import io
	import time
	import json
	import base64
	import requests

	# Start SVD (Stable Video Diffusion) locally:
	# docker run -d -p 5000:5000 --gpus=all r8.im/stability-ai/stable-video-diffusion@sha256:3f0457e4619daac51203dedb472816fd4af51f3149fa7a9e0b5ffcf1b8172438