# lucataco/runllama2.py

## runllama2.py
import time
import json
import requests


# Start Llama2 13b locally:
# docker run -d -p 5000:5000 --gpus=all r8.im/meta/llama-2-13b@sha256:078d7a002387bd96d93b0302a4c03b3f15824b63104034bfa943c63a8f208c38


# Prediction endpoint of the locally running container (see the docker
# command in the comment above).
url = "http://localhost:5000/predictions"
headers = {
    "Content-Type": "application/json",
}
data = {
    "input": {
        "prompt": "Once upon a time a llama explored",
        "max_new_tokens": 128,
    },
}

# (connect, read) timeouts in seconds. Without a timeout requests waits
# indefinitely; the read timeout is generous because the call only returns
# once the server has produced its full response.
REQUEST_TIMEOUT = (5, 600)


# perf_counter() is monotonic, so the measured interval cannot be skewed by
# system clock adjustments the way a time.time() difference can.
t1 = time.perf_counter()

for _ in range(1):
    response = requests.post(
        url,
        headers=headers,
        data=json.dumps(data),
        timeout=REQUEST_TIMEOUT,
    )
    # Fail with the HTTP status instead of a confusing JSON/KeyError below.
    response.raise_for_status()

t2 = time.perf_counter()

# Print out the returned output
resp = response.json()
output = resp["output"]
# NOTE(review): the original comment describes len(output) as the number of
# tokens created. That assumes "output" is a list with one entry per token;
# for a plain string this is the character count -- confirm against the server.
print(len(output))
print(output)


# Print the elapsed time of the request loop, in seconds
print("Time taken: ", t2 - t1)
print("Done")
	import time
	import json
	import requests


	# Start Llama2 13b locally:
	# docker run -d -p 5000:5000 --gpus=all r8.im/meta/llama-2-13b@sha256:078d7a002387bd96d93b0302a4c03b3f15824b63104034bfa943c63a8f208c38


	# Prediction endpoint of the locally running container (see the docker
	# command in the comment above).
	url = "http://localhost:5000/predictions"
	headers = {
		"Content-Type": "application/json",
	}
	data = {
		"input": {
			"prompt": "Once upon a time a llama explored",
			"max_new_tokens": 128,
		},
	}

	# (connect, read) timeouts in seconds. Without a timeout requests waits
	# indefinitely; the read timeout is generous because the call only returns
	# once the server has produced its full response.
	REQUEST_TIMEOUT = (5, 600)


	# perf_counter() is monotonic, so the measured interval cannot be skewed by
	# system clock adjustments the way a time.time() difference can.
	t1 = time.perf_counter()

	# The loop body must be nested under the `for`; the flattened original
	# had it at the same indent as the loop header, which does not parse.
	for _ in range(1):
		response = requests.post(
			url,
			headers=headers,
			data=json.dumps(data),
			timeout=REQUEST_TIMEOUT,
		)
		# Fail with the HTTP status instead of a confusing JSON/KeyError below.
		response.raise_for_status()

	t2 = time.perf_counter()

	# Print out the returned output
	resp = response.json()
	output = resp["output"]
	# NOTE(review): the original comment describes len(output) as the number of
	# tokens created. That assumes "output" is a list with one entry per token;
	# for a plain string this is the character count -- confirm against the server.
	print(len(output))
	print(output)


	# Print the elapsed time of the request loop, in seconds
	print("Time taken: ", t2 - t1)
	print("Done")