Skip to content

Instantly share code, notes, and snippets.

@proger
Last active March 13, 2024 11:36
Show Gist options
  • Save proger/fe48b82044a02e73bbd1c37209576f1a to your computer and use it in GitHub Desktop.
Save proger/fe48b82044a02e73bbd1c37209576f1a to your computer and use it in GitHub Desktop.
"""
# download the model
huggingface-cli download google/gemma-2b-it
# run the server (set the model name here and in the prompt function below)
# notice --kv_cache_dtype fp8_e5m2
docker run --gpus all -p 8000:8000 -e HF_HOME=/hf -e CUDA_VISIBLE_DEVICES=1 -v ~/.cache/huggingface:/hf vllm/vllm-openai:latest --host 0.0.0.0 --model google/gemma-2b-it --kv-cache-dtype fp8_e5m2
# ask one question
echo 'what is python?' | python -m prompt | jq -r '.choices[].text'
# request many prompts
cat prompts.txt | python -m prompt >> responses.txt
"""
import sys
import requests
import json
def prompt(input, url="http://localhost:8000/v1/completions", timeout=120.0):
    """Send one completion request to a vLLM OpenAI-compatible server.

    Parameters
    ----------
    input : str
        The prompt text to complete.
    url : str
        The server's completions endpoint.
    timeout : float
        Seconds to wait for the HTTP response (new, backward-compatible
        default; previously the request could block forever).

    Returns
    -------
    dict
        The parsed JSON response body (OpenAI completions schema).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # NOTE: `input` shadows the builtin, but the parameter name is kept
    # so existing keyword callers (prompt(input=...)) still work.
    data = {
        "prompt": input,
        "max_tokens": 512,
        "temperature": 0,  # greedy decoding
        # Alternative sampling settings, kept from the original for tuning:
        #"temperature": 1.0,
        #"top_p": 0.001,
        #"top_k": 40,
        "model": "google/gemma-2b-it",
        "presence_penalty": 0.1,
        "use_beam_search": False,  # vLLM-specific extension field
        #"n": 1,
        #"logprobs": 1,
    }
    # json= lets requests serialize the payload and set the
    # Content-Type header itself — no manual json.dumps/headers needed.
    response = requests.post(url, json=data, timeout=timeout)
    # Fail loudly on HTTP errors instead of returning an error body
    # that downstream jq filters would silently drop.
    response.raise_for_status()
    return response.json()
def _main():
    """Read prompts from stdin, one per line, and print one JSON response per line."""
    for line in sys.stdin:
        result = prompt(line.strip())
        # ensure_ascii=False keeps non-ASCII model output human-readable
        # when responses are appended to a file.
        print(json.dumps(result, ensure_ascii=False))


# Guard the entry point so importing this module (rather than running it
# via `python -m prompt`) does not block reading stdin.
if __name__ == "__main__":
    _main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment