# makispl/rag.py

## rag.py
# Build the context string handed to the models: within each retrieved
# result, every '' (two single quotes) delimiter in its 'text' becomes a
# newline; the results are then concatenated back-to-back, with no separator
# inserted between one result and the next.
embedded_text = ''.join(
    hit['text'].replace("''", '\n') for hit in query_res
)


# HuggingFace model identifiers whose performance is to be compared
models = [
    "llmware/bling-1b-0.1",
    "llmware/bling-1.4b-0.1",
    "llmware/bling-falcon-1b-0.1",
    "llmware/bling-cerebras-1.3b-0.1",
    "llmware/bling-sheared-llama-1.3b-0.1",
    "llmware/bling-sheared-llama-2.7b-0.1",
    "llmware/bling-red-pajamas-3b-0.1",
]

# Benchmark loop: load each candidate model in turn, ask it the same `query`
# against the same `embedded_text` context, and print the answer, the usage
# record and the timings for that model.
for model in models:
    # perf_counter() is a monotonic, high-resolution clock, so the intervals
    # below are not skewed if the system wall clock is adjusted mid-run.
    t0 = time.perf_counter()
    print(f"\n > Loading Model: {model}...")
    # from_hf=True presumably fetches the checkpoint from the HuggingFace
    # hub, with the empty api_key meaning no credential is supplied
    # -- TODO confirm against the llmware Prompt API.
    prompter = Prompt().load_model(model, from_hf=True, api_key="")

    t1 = time.perf_counter()
    print(f"\n > Model {model} load time: {t1-t0} seconds")

    print(f"Query: {query}")
    output = prompter.prompt_main(
        query,
        context=embedded_text,
        prompt_name="default_with_context",
        temperature=0.0,
    )

    # The result is indexed by "llm_response" (the answer text) and "usage";
    # surrounding newlines are trimmed from the answer before printing.
    llm_response = output["llm_response"].strip("\n")
    print(f"\n > LLM Response: {llm_response}")
    print(f"\n > LLM Usage: {output['usage']}")

    # Measured from t1, i.e. after loading finished: this interval covers the
    # prompt call and the printing above, not the model load time.
    t2 = time.perf_counter()
    print(f"\nTotal processing time: {t2-t1} seconds")