Flask-based endpoint to emulate OpenAI API endpoints using llama/alpaca and HF models
# a simple Flask API to emulate OpenAI's API using llama models and/or transformers
# runs on a 3080
import sys
import time
import torch
import json
from peft import PeftModel
from flask import Flask, make_response, request, abort
from flask.json import jsonify
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import scan_cache_dir
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

# tested on a 3080
LOAD_8BIT = False
BASE_MODEL = "decapoda-research/llama-7b-hf"
LORA_WEIGHTS = "tloen/alpaca-lora-7b"

# clues from:
# https://github.com/shawwn/openai-server
# https://github.com/jquesnelle/transformers-openai-api
# https://github.com/facebookresearch/metaseq
# https://github.com/tloen/alpaca-lora

# requirement: pip3 install transformers huggingface_hub flask
# requirement: pip3 install sentencepiece
# requirement: pip3 install git+https://github.com/huggingface/transformers.git
# requirement: pip3 install accelerate
# requirement: pip3 install bitsandbytes
# requirement: pip3 install git+https://github.com/huggingface/peft.git
# requirement: pip3 install loralib

# set up the Flask application
app = Flask(__name__)

cached_model = ""
tokenizer = None
model = None
models = {}
llamaModels = [
    'llama-7b-hf',
    'alpaca-7b-hf',
    'decapoda-research/llama-7b-hf',
    'tloen/alpaca-lora-7b',
    'decapoda-research/llama-7b-hf-int4',
    'decapoda-research/llama-13b-hf-int4',
    'decapoda-research/llama-65b-hf-int4',
    'decapoda-research/llama-30b-hf-int4',
    'decapoda-research/llama-30b-hf',
    'decapoda-research/llama-65b-hf',
    'decapoda-research/llama-13b-hf',
    'decapoda-research/llama-smallint-pt',
    'decapoda-research/llama-7b-hf-int8',
]
# collect the models available in the local Hugging Face cache
report = scan_cache_dir()
modelList = []
for repo in report.repos:
    print("repo_id:", json.dumps(repo.repo_id, indent=4))
    print("repo_type:", json.dumps(repo.repo_type, indent=4))
    print("repo_path:", json.dumps(str(repo.repo_path), indent=4))
    #print("revisions", json.dumps(str(repo.revisions), indent=4))
    print("size_on_disk:", json.dumps(repo.size_on_disk, indent=4))
    print("nb_files:", json.dumps(repo.nb_files, indent=4))
    #print(json.dumps(repo.str(refs), indent=4))
    alias = repo.repo_id
    if '/' in repo.repo_id:
        alias = repo.repo_id.split('/')[1]
    modelList.append(alias)
    models[alias] = repo.repo_id
    print()

# register the known llama/alpaca models under a short alias as well
for modelname in llamaModels:
    alias = modelname
    if '/' in modelname:
        alias = modelname.split('/')[1]
    models[alias] = modelname
    modelList.append(alias)

modelList.sort()
print("Available models:")
for name in modelList:  # renamed from "model" so the global model variable is not clobbered
    print(name)
# find out which device we are using
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

try:
    if torch.backends.mps.is_available():
        device = "mps"
except Exception:
    pass

print("Using device: {}".format(device))
# set up the llama model
if device == "cuda":
    lmodel = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        load_in_8bit=LOAD_8BIT,
        torch_dtype=torch.float16,
        device_map="auto",
        resume_download=True,
    )
    lmodel = PeftModel.from_pretrained(
        lmodel,
        LORA_WEIGHTS,
        torch_dtype=torch.float16,
    )
elif device == "mps":
    lmodel = LlamaForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map={"": device},
        torch_dtype=torch.float16,
        resume_download=True,
    )
    lmodel = PeftModel.from_pretrained(
        lmodel,
        LORA_WEIGHTS,
        device_map={"": device},
        torch_dtype=torch.float16,
    )
else:
    lmodel = LlamaForCausalLM.from_pretrained(
        BASE_MODEL, device_map={"": device}, low_cpu_mem_usage=True
    )
    lmodel = PeftModel.from_pretrained(
        lmodel,
        LORA_WEIGHTS,
        device_map={"": device},
        resume_download=True,
    )

ltokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf", resume_download=True)
def generate_prompt_llama(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""
if not LOAD_8BIT:
    lmodel.half()  # seems to fix bugs for some users.

lmodel.eval()
if torch.__version__ >= "2" and sys.platform != "win32":
    lmodel = torch.compile(lmodel)
def evaluate_llama(
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=1,
    max_new_tokens=128,
    **kwargs,
):
    prompt = generate_prompt_llama(instruction, input)
    print(f"prompt: {prompt}")
    print(f"temperature: {temperature}")
    print(f"top_p: {top_p}")
    print(f"top_k: {top_k}")
    print(f"num_beams: {num_beams}")
    print(f"max_new_tokens: {max_new_tokens}")
    inputs = ltokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        generation_output = lmodel.generate(
            input_ids=input_ids,
            generation_config=generation_config,
            return_dict_in_generate=True,
            output_scores=True,
            max_new_tokens=max_new_tokens,
        )
    s = generation_output.sequences[0]
    output = ltokenizer.decode(s)
    print(f"output: {output}")
    gen_text = output.split("### Response:")[1].strip()
    print(f"gen_text: {gen_text}")
    return gen_text
    #return output.split("### Response:")[1].strip()
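
# Example call (hypothetical; with the defaults above and the test prompt from the
# curl transcript at the bottom of this file):
#   evaluate_llama("Say this is a test")  ->  "This is a test."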
def update_model(model_name):
    global cached_model, llamaModels, ltokenizer, lmodel, tokenizer, model
    # is it an alias?
    if model_name in models:
        model_name = models[model_name]
    # llama/alpaca models use the preloaded llama model and tokenizer
    if model_name in llamaModels:
        print("Using llama model: {}".format(model_name))
        return ltokenizer, lmodel
    # otherwise load a Hugging Face seq2seq model, caching the last one loaded
    if model_name != cached_model:
        print("Loading model: {}".format(model_name))
        cached_model = model_name
        tokenizer = AutoTokenizer.from_pretrained(model_name, resume_download=True)
        model = AutoModelForSeq2SeqLM.from_pretrained(model_name, resume_download=True)
        model.to("cuda")
    return tokenizer, model
def decode_kwargs(data):
    # map the OpenAI request parameters to huggingface generate() kwargs
    kwargs = {}
    if 'n' in data:
        kwargs['num_return_sequences'] = data['n']
    if 'stop' in data:
        kwargs['early_stopping'] = True
        kwargs['stop_token'] = data['stop']
    if 'suffix' in data:
        kwargs['suffix'] = data['suffix']
    if 'presence_penalty' in data:
        kwargs['presence_penalty'] = data['presence_penalty']
    if 'frequency_penalty' in data:
        kwargs['repetition_penalty'] = data['frequency_penalty']
    if 'repetition_penalty' in data:
        kwargs['repetition_penalty'] = data['repetition_penalty']
    if 'best_of' in data:
        kwargs['num_return_sequences'] = data['best_of']
    #kwargs['do_sample'] = True
    #for key, value in data.items():
    #    if key in ["temperature", "top_p", "top_k", "num_beams", "max_new_tokens"]:
    #        kwargs[key] = value
    return kwargs
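
# Example mapping performed by decode_kwargs:
#   decode_kwargs({"n": 2, "frequency_penalty": 1.3})
#   -> {"num_return_sequences": 2, "repetition_penalty": 1.3}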
# define the completion endpoint
@app.route("/v1/engines/<model_name>/completions", methods=["POST"])
def completions(model_name):
    # get the request data
    data = request.get_json(force=True)

    # is it an alias?
    if model_name in models:
        model_name = models[model_name]

    # update model
    tokenizer, model = update_model(model_name)

    # get the prompt and other parameters from the request data
    prompt = data["prompt"]
    max_tokens = data.get("max_tokens", 16)
    temperature = data.get("temperature", 1.0)
    top_p = data.get("top_p", 0.75)
    top_k = data.get("top_k", 40)
    num_beams = data.get("num_beams", 1)
    max_new_tokens = data.get("max_new_tokens", 256)
    kwargs = decode_kwargs(data)

    # generate the completion
    if model_name in llamaModels:
        generated_text = evaluate_llama(prompt,
                                        temperature=temperature,
                                        top_p=top_p,
                                        top_k=top_k,
                                        num_beams=num_beams,
                                        max_new_tokens=max_new_tokens,
                                        **kwargs)
    else:
        # move the input ids onto the model's device before generating
        input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
        output = model.generate(input_ids=input_ids,
                                max_length=max_tokens,
                                temperature=temperature,
                                top_p=top_p,
                                top_k=top_k,
                                **kwargs)
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    prompt_tokens = len(tokenizer.encode(prompt))
    completion_tokens = len(tokenizer.encode(generated_text))
    total_tokens = prompt_tokens + completion_tokens

    # return the response data
    return jsonify({
        'object': 'text_completion',
        'id': 'dummy',
        'created': int(time.time()),
        'model': model_name,
        'choices':
            [{'text': generated_text, 'finish_reason': 'length'}],
        'usage': {
            'prompt_tokens': prompt_tokens,
            'completion_tokens': completion_tokens,
            'total_tokens': total_tokens
        }
    })
@app.route("/v1/chat/completions", methods=["POST"]) | |
def chat_completions(): | |
# get the request data | |
data = request.get_json(force=True) | |
model_name = data["model"] | |
messages = data["messages"] | |
# generate prompt from messages | |
# messages must be an array of message objects, where each object has a role (either "system", "user", or "assistant") and content (the content of the message). | |
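    # example request body (taken from the curl test at the bottom of this file):
    #   {"model": "alpaca-lora-7b", "max_tokens": 64, "temperature": 0.95,
    #    "messages": [{"role": "user", "content": "Hello!"}]}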
prompt = "" | |
for message in messages: | |
prompt += message["role"] + ": " + message["content"] + "\n" | |
#prompt += "assistant: " | |
# is it an alias? | |
if (model_name in models): | |
model_name = models[model_name] | |
#update model | |
tokenizer, model = update_model(model_name) | |
# get the prompt and other parameters from the request data | |
#prompt = data["prompt"] | |
max_tokens = data.get("max_tokens", 16) | |
temperature = data.get("temperature", 1.0) | |
top_p = data.get("top_p", 0.75) | |
top_k = data.get("top_k", 40) | |
num_beams = data.get("num_beams", 1) | |
max_new_tokens = data.get("max_new_tokens", 256) | |
kwargs = decode_kwargs(data) | |
if (model_name in llamaModels): | |
#generated_text = evaluate_llama_chat(prompt,**kwargs) | |
instruction = "Be a generallly helpful assistiang chatting with the user. Return the response for the assistant." | |
generated_text = evaluate_llama(instruction, | |
input = prompt, | |
temperature=temperature, | |
top_p=top_p, | |
top_k=top_k, | |
num_beams=num_beams, | |
max_new_tokens=max_new_tokens, | |
**kwargs) | |
else: | |
input_ids = tokenizer.encode(prompt, return_tensors='pt') | |
output = model.generate(input_ids=input_ids, | |
max_length=max_tokens, | |
temperature=temperature, | |
top_p=top_p, | |
top_k=top_k, | |
**kwargs) | |
generated_text = tokenizer.decode(output[0], skip_special_tokens=True) | |
prompt_tokens = len(tokenizer.encode(prompt)) | |
completion_tokens = len(tokenizer.encode(generated_text)) | |
total_tokens = prompt_tokens + completion_tokens | |
return jsonify( { | |
'object': 'text_completion', | |
'id': 'dummy', | |
'created': int(time.time()), | |
'model': model_name, | |
'choices': | |
[{'role':'assistant','content': generated_text, 'finish_reason': 'stop'}], | |
'usage': { | |
'prompt_tokens': prompt_tokens, | |
'completion_tokens': completion_tokens, | |
'total_tokens': total_tokens | |
} | |
} | |
) | |
# return the response data | |
# return jsonify(response.choices[0].text) | |
@app.route('/v1/completions', methods=['POST'])
def v1_completions():
    print("COMPLETION REQUEST", request.json)
    return completions(request.json['model'])

# define the engines endpoint
@app.route('/v1/engines')
@app.route('/v1/models')
def v1_engines():
    return make_response(jsonify({
        'data': [{
            'object': 'engine',
            'id': id,
            'ready': True,
            'owner': 'huggingface',
            'permissions': None,
            'created': None
        } for id in models.keys()]
    }))

if __name__ == "__main__":
    app.run()
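
# To run (assuming this file is saved as app.py -- the filename is an assumption):
#   python3 app.py
# Flask's development server listens on http://127.0.0.1:5000 by default, which is
# the address used in the curl examples below.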
""" | |
curl http://127.0.0.1:5000/v1/completions -v -H "Content-Type: application/json" -H "Authorization: Bearer $OPENAI_API_KEY" --data "{\"model\":\"alpaca-lora-7b\",\"prompt\":\"Say this is a test\",\"max_tokens\":7,\"temperature\":0}" | |
* Trying 127.0.0.1:5000... | |
* Connected to 127.0.0.1 (127.0.0.1) port 5000 (#0) | |
> POST /v1/completions HTTP/1.1 | |
> Host: 127.0.0.1:5000 | |
> User-Agent: curl/7.83.1 | |
> Accept: */* | |
> Content-Type: application/json | |
> Authorization: Bearer $OPENAI_API_KEY | |
> Content-Length: 87 | |
> | |
* Mark bundle as not supporting multiuse | |
< HTTP/1.1 200 OK | |
< Server: Werkzeug/2.2.3 Python/3.10.9 | |
< Date: Fri, 24 Mar 2023 22:19:13 GMT | |
< Content-Type: application/json | |
< Content-Length: 226 | |
< Connection: close | |
< | |
{"choices":[{"finish_reason":"length","text":"This is a test."}],"created":1679696353,"id":"dummy","model":"tloen/alpaca-lora-7b","object":"text_completion","usage":{"completion_tokens":6,"prompt_tokens":6,"total_tokens":12}} | |
* Closing connection 0 | |
curl http://127.0.0.1:5000/v1/chat/completions -v -H "Content-Type: application/json" -H "Authorization: Bearer $OPENAI_API_KEY" --data "{\"model\":\"alpaca-lora-7b\",\"max_tokens\":64,\"temperature\":0.95, \"messages\": [{\"role\": \"user\", \"content\": \"Hello!\"}]}" | |
* Trying 127.0.0.1:5000... | |
* Connected to 127.0.0.1 (127.0.0.1) port 5000 (#0) | |
> POST /v1/chat/completions HTTP/1.1 | |
> Host: 127.0.0.1:5000 | |
> User-Agent: curl/7.83.1 | |
> Accept: */* | |
> Content-Type: application/json | |
> Authorization: Bearer $OPENAI_API_KEY | |
> Content-Length: 115 | |
> | |
* Mark bundle as not supporting multiuse | |
< HTTP/1.1 200 OK | |
< Server: Werkzeug/2.2.3 Python/3.10.9 | |
< Date: Fri, 24 Mar 2023 22:25:01 GMT | |
< Content-Type: application/json | |
< Content-Length: 257 | |
< Connection: close | |
< | |
{"choices":[{"content":"Hello! How can I help you?","finish_reason":"stop","role":"assistant"}],"created":1679696701,"id":"dummy","model":"tloen/alpaca-lora-7b","object":"text_completion","usage":{"completion_tokens":9,"prompt_tokens":6,"total_tokens":15}} | |
* Closing connection 0 | |
curl http://127.0.0.1:5000/v1/models | |
""" |