Skip to content

Instantly share code, notes, and snippets.

@makispl
Created December 16, 2023 13:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save makispl/93738b807dbaaeb8bf4c5ea9375c62d9 to your computer and use it in GitHub Desktop.
Save makispl/93738b807dbaaeb8bf4c5ea9375c62d9 to your computer and use it in GitHub Desktop.
# Assemble the context passage handed to the models: inside each retrieved
# result, every pair of consecutive single quotes ('') becomes a line break,
# and the results are then concatenated back-to-back with no separator.
embedded_text = ''.join(
    hit['text'].replace("''", '\n') for hit in query_res
)
# Hugging Face model IDs to benchmark, one entry per checkpoint; each of
# them is loaded and prompted in turn.
models = [
    "llmware/bling-1b-0.1",
    "llmware/bling-1.4b-0.1",
    "llmware/bling-falcon-1b-0.1",
    "llmware/bling-cerebras-1.3b-0.1",
    "llmware/bling-sheared-llama-1.3b-0.1",
    "llmware/bling-sheared-llama-2.7b-0.1",
    "llmware/bling-red-pajamas-3b-0.1",
]
# Run the same query against every model in turn and report, per model, the
# answer, the usage record returned with it, and how long loading and
# prompting took.
for model in models:
    print(f"\n > Loading Model: {model}...")
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; unlike time.time() it cannot jump when the system
    # clock is adjusted.
    load_start = time.perf_counter()
    prompter = Prompt().load_model(model, from_hf=True, api_key="")
    load_end = time.perf_counter()
    print(f"\n > Model {model} load time: {load_end - load_start} seconds")

    print(f"Query: {query}")
    # Time only the prompt call itself so console output is not counted as
    # model processing time.
    prompt_start = time.perf_counter()
    output = prompter.prompt_main(
        query,
        context=embedded_text,
        prompt_name="default_with_context",
        temperature=0.0,
    )
    prompt_end = time.perf_counter()

    # Drop leading/trailing newlines from the answer before display.
    llm_response = output["llm_response"].strip("\n")
    print(f"\n > LLM Response: {llm_response}")
    print(f"\n > LLM Usage: {output['usage']}")
    # NOTE: despite the "Total" label, this figure covers the prompt call
    # only; the model load time is reported separately above.
    print(f"\nTotal processing time: {prompt_end - prompt_start} seconds")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment