Skip to content

Instantly share code, notes, and snippets.

@proger
Last active March 13, 2024 11:36
Show Gist options
  • Save proger/fe48b82044a02e73bbd1c37209576f1a to your computer and use it in GitHub Desktop.
Save proger/fe48b82044a02e73bbd1c37209576f1a to your computer and use it in GitHub Desktop.
"""
# download the model
huggingface-cli download google/gemma-2b-it
# run the server (set the model name here and in the prompt function below)
# notice --kv_cache_dtype fp8_e5m2
docker run --gpus all -p 8000:8000 -e HF_HOME=/hf -e CUDA_VISIBLE_DEVICES=1 -v ~/.cache/huggingface:/hf vllm/vllm-openai:latest --host 0.0.0.0 --model google/gemma-2b-it --kv-cache-dtype fp8_e5m2
# ask one question
echo 'what is python?' | python -m prompt | jq -r '.choices[].text'
# request many prompts
cat prompts.txt | python -m prompt >> responses.txt
"""
import sys
import requests
import json
def prompt(input, url="http://localhost:8000/v1/completions", timeout=120.0):
    """Send one completion request to a vLLM OpenAI-compatible server.

    Parameters
    ----------
    input : str
        The prompt text to complete.
    url : str
        The server's completions endpoint.
    timeout : float
        Seconds to wait for the HTTP response (new, backward-compatible
        default; previously the request could block forever).

    Returns
    -------
    dict
        The parsed JSON response body (OpenAI completions schema).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # NOTE: `input` shadows the builtin, but the parameter name is kept
    # so existing keyword callers (prompt(input=...)) still work.
    data = {
        "prompt": input,
        "max_tokens": 512,
        "temperature": 0,  # greedy decoding
        # Alternative sampling settings, kept from the original for tuning:
        #"temperature": 1.0,
        #"top_p": 0.001,
        #"top_k": 40,
        "model": "google/gemma-2b-it",
        "presence_penalty": 0.1,
        "use_beam_search": False,  # vLLM-specific extension field
        #"n": 1,
        #"logprobs": 1,
    }
    # json= lets requests serialize the payload and set the
    # Content-Type header itself — no manual json.dumps/headers needed.
    response = requests.post(url, json=data, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning an error body
    # that downstream jq filters would silently drop.
    response.raise_for_status()
    return response.json()
def _main():
    """Read prompts from stdin, one per line, and print one JSON response per line."""
    for line in sys.stdin:
        result = prompt(line.strip())
        # ensure_ascii=False keeps non-ASCII model output human-readable
        # when responses are appended to a file.
        print(json.dumps(result, ensure_ascii=False))


# Guard the entry point so importing this module (rather than running it
# via `python -m prompt`) does not block reading stdin.
if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment