Profile RAM and NVIDIA GPU VRAM on Windows
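The PowerShell script below starts a nitro.exe inference server, loads a model through its local HTTP API, then samples the process's RAM (working set) and the GPU's VRAM (via nvidia-smi) every 3 seconds for 30 seconds, reporting the peak and average of each. Edit $username and the paths at the top to match your machine; see the usage sketch after the script and the setup notes at the end.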
$username = "jan"

# Define paths using the username variable. Backslashes are doubled so the
# model paths remain valid (escaped) when interpolated into the JSON request
# bodies below; Windows collapses repeated backslashes when resolving paths.
$nitroPath1 = "C:\\Users\\$username\\jan\\engines\\nitro-tensorrt-llm\\0.1.8\\ampere\\nitro.exe"
$nitroPath2 = "C:\\Users\\$username\\jan\\extensions\\@janhq\\inference-nitro-extension\\dist\\bin\\win-cuda-12-0\\nitro.exe"
$modelPath1 = "C:\\Users\\$username\\jan\\models\\mistral-7b-instruct-int4"
$modelPath2 = "C:\\Users\\$username\\jan\\models\\mistral-ins-7b-q4\\mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# Function to get current RAM (bytes) and VRAM (MiB) usage
function Get-MemoryUsage {
    # Sum working sets in case more than one nitro process is running
    $ram = (Get-Process -Name "nitro" -ErrorAction SilentlyContinue |
        Measure-Object -Property WorkingSet64 -Sum).Sum
    if (-not $ram) { $ram = 0 }
    $vramOutput = & "nvidia-smi" --query-gpu=memory.used --format=csv,noheader,nounits
    # Write-Host so this debug line doesn't pollute the function's return stream
    Write-Host "VRAM Output: $vramOutput"
    # nvidia-smi prints one line per GPU; sum the readings, defaulting to 0
    $vram = if ($vramOutput) {
        ($vramOutput | ForEach-Object { [int]$_.Trim() } | Measure-Object -Sum).Sum
    } else { 0 }
    return @{ RAM = $ram; VRAM = $vram }
}
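# Note on units: nvidia-smi's memory.used is reported in MiB (",nounits" only
# strips the suffix), while a process working set is in bytes, so the RAM and
# VRAM figures are not directly comparable and are labeled accordingly below.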
# Function to perform the load-model request and check the response
function Load-Model {
    param (
        [string]$uri,
        [string]$body
    )
    # Round-trip through ConvertFrom-Json/ConvertTo-Json to pretty-print the request
    $jsonBody = $body | ConvertFrom-Json | ConvertTo-Json
    Write-Output "Sending JSON request body:"
    Write-Output $jsonBody
    # Note: in Windows PowerShell, Invoke-WebRequest throws on HTTP error statuses,
    # so a failed load usually surfaces as an error before reaching the else branch
    $response = Invoke-WebRequest -Uri $uri -Method Post -ContentType "application/json" -Body $body
    if ($response.StatusCode -eq 200) {
        Write-Output "Model loaded successfully."
        Start-Sleep -Seconds 3 # Give the model a moment to become ready
        # Pretty-print the response body on success
        $responseContent = $response.Content | ConvertFrom-Json | ConvertTo-Json
        Write-Output "Response Body:"
        Write-Output $responseContent
    } else {
        Write-Output "Failed to load model. Status code: $($response.StatusCode)"
        exit
    }
}
# Function to start Nitro, load the model, and monitor memory usage
function Start-Nitro {
    param (
        [string]$nitroPath,
        [string]$modelType
    )

    # Start Nitro
    Start-Process -FilePath $nitroPath

    # Sample memory after Nitro has had time to start
    Start-Sleep -Seconds 5
    $memoryAfterNitro = Get-MemoryUsage
    Write-Output "RAM after starting Nitro: $($memoryAfterNitro.RAM) bytes"
    Write-Output "VRAM after starting Nitro: $($memoryAfterNitro.VRAM) MiB"

    # Pick the load-model endpoint and request body for the engine type
    $webRequestUri = $null
    $webRequestBody = $null
    if ($modelType -eq "tensorrt_llm") {
        $webRequestUri = "http://localhost:3928/inferences/tensorrtllm/loadmodel"
        $webRequestBody = @"
{
    "engine_path": "$modelPath1"
}
"@
    } else {
        $webRequestUri = "http://localhost:3928/inferences/llamacpp/loadmodel"
        $webRequestBody = @"
{
    "llama_model_path": "$modelPath2"
}
"@
    }

    # Load the model and ensure it's ready
    Load-Model -uri $webRequestUri -body $webRequestBody

    # Sample memory every 3 seconds for 30 seconds, then compute peak/average
    $ramReadings = @()
    $vramReadings = @()
    $endTime = (Get-Date).AddSeconds(30)
    while ((Get-Date) -lt $endTime) {
        Start-Sleep -Seconds 3
        $currentMemory = Get-MemoryUsage
        $ramReadings += $currentMemory.RAM
        $vramReadings += $currentMemory.VRAM
        Write-Output "Current RAM: $($currentMemory.RAM) bytes"
        Write-Output "Current VRAM: $($currentMemory.VRAM) MiB"
    }

    # Calculate peak and average for RAM and VRAM
    $peakRAM = ($ramReadings | Measure-Object -Maximum).Maximum
    $averageRAM = ($ramReadings | Measure-Object -Average).Average
    $peakVRAM = ($vramReadings | Measure-Object -Maximum).Maximum
    $averageVRAM = ($vramReadings | Measure-Object -Average).Average
    Write-Output "Peak RAM Usage: $peakRAM bytes"
    Write-Output "Average RAM Usage: $averageRAM bytes"
    Write-Output "Peak VRAM Usage: $peakVRAM MiB"
    Write-Output "Average VRAM Usage: $averageVRAM MiB"
}

# Execute for the first Nitro binary with engine type tensorrt_llm
# Start-Nitro -nitroPath $nitroPath1 -modelType "tensorrt_llm"
# Execute for the second Nitro binary with engine type llamacpp
Start-Nitro -nitroPath $nitroPath2 -modelType "llamacpp"
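A minimal way to run the profiler, assuming the script above is saved as profile-nitro.ps1 (hypothetical filename) in the current directory and that script execution has been enabled as described in the notes below:

# One-time setup so locally created scripts are allowed to run
Set-ExecutionPolicy RemoteSigned -Scope CurrentUser
Get-ExecutionPolicy   # verify the policy is no longer Restricted

# Profile the llamacpp build; swap the Start-Nitro calls at the bottom of the
# script to profile the tensorrt_llm build instead
.\profile-nitro.ps1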
Setup notes:
- Install the tensorrt-llm extension and download 2 mistral models (q4 and int4).
- Set $username to your Windows user name; the script looks for the nitro binaries and models under C:\Users\<username>\jan\.
- Run Set-ExecutionPolicy RemoteSigned, then Get-ExecutionPolicy to verify the policy is not Restricted.
- For benchmarking with llmperf (token_benchmark_ray.py), point OPENAI_BASE_URL to http://localhost:3928/v1; a sketch follows these notes.
- The script covers both llama.cpp and tensorrt_llm runs: uncomment the corresponding Start-Nitro call at the bottom.
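A hedged sketch of the llmperf step, run from a local checkout of llmperf. The flag names follow llmperf's README and the values and model name are illustrative (the model name is taken from the model folder above); depending on the llmperf version, the environment variable may be named OPENAI_API_BASE rather than OPENAI_BASE_URL:

# Point the OpenAI-compatible client at the local Nitro server
$env:OPENAI_BASE_URL = "http://localhost:3928/v1"
$env:OPENAI_API_KEY = "dummy"   # placeholder; the local server does not validate it
# Run llmperf's token benchmark against the loaded model
python token_benchmark_ray.py --model "mistral-ins-7b-q4" `
    --llm-api openai `
    --mean-input-tokens 550 --mean-output-tokens 150 `
    --num-concurrent-requests 1 --max-num-completed-requests 10 `
    --results-dir "result_outputs"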