Cheng Li (cli99), GitHub Gists
Problem

I have two GitHub accounts: oanhnn (personal) and superman (work). I want to use both accounts on the same computer, without typing a password every time I git push or pull.

Solution

Use SSH keys and define host aliases in the SSH config file, one alias per account.

How to?

  1. Generate an SSH key pair for each account and add each public key to the matching GitHub account (see the sketch below this list).
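The remaining steps of the original gist are cut off in this preview. As a minimal sketch, assuming illustrative key file names id_ed25519_oanhnn and id_ed25519_superman, the key generation and the host aliases could look like this:

# Generate one key pair per account (file names are illustrative)
ssh-keygen -t ed25519 -C "personal" -f ~/.ssh/id_ed25519_oanhnn
ssh-keygen -t ed25519 -C "work" -f ~/.ssh/id_ed25519_superman

# ~/.ssh/config: one host alias per account
Host github.com-oanhnn
    HostName github.com
    User git
    IdentityFile ~/.ssh/id_ed25519_oanhnn

Host github.com-superman
    HostName github.com
    User git
    IdentityFile ~/.ssh/id_ed25519_superman

# Clone (or point an existing remote) through the alias instead of github.com
git clone git@github.com-superman:company/project.git

With this setup, SSH resolves each alias to github.com but presents the per-account key, so neither account prompts for a password.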
from vllm import LLM, SamplingParams

model_id = "/mnt/workdisk/chengli/models/llama3.1/llama-70b-instruct"
tensor_parallel_size = 4
llm = LLM(
    model=model_id,
    tensor_parallel_size=tensor_parallel_size,
)
prompts = [
    "Hello, my name is",  # placeholder; the original prompt list is truncated in the preview
]
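The preview stops at the prompt list; a hedged sketch of how the LLM object above would typically be used (the SamplingParams values are illustrative, not from the gist):

sampling_params = SamplingParams(temperature=0.8, top_p=0.95)  # illustrative values
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(output.outputs[0].text)  # generated completion for each prompt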
# Offload the embedding and the first four decoder layers to CPU,
# keep layers 4-7 on GPU 1 (device index 1)
device_map = {
    "model.embed_tokens": "cpu",
    "model.layers.0": "cpu",
    "model.layers.1": "cpu",
    "model.layers.2": "cpu",
    "model.layers.3": "cpu",
    "model.layers.4": 1,
    "model.layers.5": 1,
    "model.layers.6": 1,
    "model.layers.7": 1,
    # remaining entries are truncated in the preview
}
# https://github.com/huggingface/transformers/pull/32047
# CUDA Nightly
# pip install --pre torch --index-url https://download.pytorch.org/whl/nightly/cu121/
# pip install --pre fbgemm-gpu --index-url https://download.pytorch.org/whl/nightly/cu121/
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, FbgemmFp8Config
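The preview ends at the imports; a hedged sketch of how FbgemmFp8Config is typically passed to from_pretrained (the model name here is illustrative, not from the gist):

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # illustrative checkpoint
quantization_config = FbgemmFp8Config()  # quantize weights to FP8 via fbgemm-gpu
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)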
cli99 / test_fp8.py (Last active August 29, 2024 02:34)
vLLM FP8
import os
import time

import torch
import transformers
from torch.profiler import ProfilerActivity, profile, record_function
from vllm import LLM, SamplingParams

os.environ["HOST_IP"] = "10.42.10.16"  # assumed purpose: host IP for vLLM's distributed init; the preview ends here
cli99 / test_torch_compile.py (Created September 18, 2024 00:14)
torch.compile
import timeit
import torch

# The timings in these comments were measured on the gist's original function,
# which is cut off in the preview.
@torch.compile()  # 0.103 seconds
# @torch.compile(fullgraph=True)  # 0.105 seconds
# @torch.compile(fullgraph=False)  # 0.102 seconds
# @torch.compile(options={"triton.cudagraphs": False}, fullgraph=True)  # 0.104 seconds
# @torch.compile(  # truncated in the preview
def fn(x):
    # placeholder body: the function actually decorated is not shown
    return torch.relu(x)
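The per-variant timings in the comments suggest the gist benchmarks each compile configuration; a hedged sketch of how such numbers could be collected with the timeit import above (input shape and iteration count are illustrative):

x = torch.randn(1024, 1024, device="cuda")
fn(x)  # warm up: the first call triggers compilation
torch.cuda.synchronize()
t = timeit.timeit(lambda: (fn(x), torch.cuda.synchronize()), number=100)
print(f"{t / 100:.3f} seconds per call")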
# speechmatics.com/company/articles-and-news/timing-operations-in-pytorch
import time
import torch

a = torch.randn(1000, 1000, device="cuda")  # 1000 * 1000 * 4 B = 4 MB of float32
torch.softmax(a, dim=1)
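The linked article is about timing CUDA operations correctly; a hedged sketch of the event-based pattern it describes, applied to the softmax above:

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()
torch.softmax(a, dim=1)
end.record()
torch.cuda.synchronize()  # events are recorded asynchronously; wait before reading
print(f"{start.elapsed_time(end):.3f} ms")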
cli99 / test_Float8_e4m3fn.py (Last active September 23, 2024 22:08)
Float8_e4m3fn
import torch
from torch.utils.cpp_extension import load_inline

finfo = torch.finfo(torch.float8_e4m3fn)
print(f"finfo: {finfo}")
# finfo(resolution=1, min=-448, max=448, eps=0.125, smallest_normal=0.015625, tiny=0.015625, dtype=float8_e4m3fn)

cuda_source = """
C10_HOST_DEVICE constexpr auto FP8_E4M3_MAX = std::numeric_limits<c10::Float8_e4m3fn>::max();
void test() {
    // body truncated in the preview
}
"""
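As a hedged companion to the finfo probe above, a pure-Python sketch of scaling a tensor into the e4m3 range and casting it (the per-tensor scaling scheme is illustrative, not from the gist):

x = torch.randn(4, 4) * 1000            # values that overflow the FP8 range
scale = finfo.max / x.abs().max()       # per-tensor scale into [-448, 448]
x_fp8 = (x * scale).to(torch.float8_e4m3fn)
print(x_fp8.dtype, x_fp8.to(torch.float32) / scale)  # dequantized view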
cli99 / get_device_info.py (Created October 3, 2024 07:36)
Get CUDA information
import pycuda.autoinit  # creates a CUDA context on import
import pycuda.driver as cuda

# Get the first CUDA device (index 0)
device = cuda.Device(0)

# List of attributes you want to query
attributes = [
    cuda.device_attribute.MAX_THREADS_PER_BLOCK,
    cuda.device_attribute.MAX_BLOCK_DIM_X,
    # more attributes are truncated in the preview
]
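The preview cuts off inside the list; reading the values is presumably a loop over device.get_attribute, sketched here:

print(device.name())
for attr in attributes:
    print(f"{attr}: {device.get_attribute(attr)}")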
cli99 / get_compiled_triton_code.py (Last active October 9, 2024 04:44)
get torch compiled triton code
import torch
from torch._inductor.utils import get_code, get_triton_code

def my_model(x):
    return torch.square(x)

compiled_model = torch.compile(my_model)
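The imports suggest the gist ends by dumping the generated kernel; a hedged sketch, assuming get_triton_code takes the compiled function followed by example inputs:

x = torch.randn(64, 64, device="cuda")     # illustrative input
print(get_triton_code(compiled_model, x))  # Triton source Inductor generated for this graph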