# kwindla/benchmark-llama3-ttfb.py

## benchmark-llama3-ttfb.py
import os
import json
import time
import statistics

from groq import Groq

# Groq API key (read from the environment) and the number of inferences to run
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
NUM_INFERENCES = 10

# Model queried by the benchmark and the prompt sent on every request
# (replace the prompt with your own input)
MODEL = "llama3-70b-8192"
INPUT_PROMPT = "Tell me about Hamlet's state of mind when he said \"to be or not to be?\""


def measure_ttfb_ms(client, prompt=INPUT_PROMPT, model=MODEL) -> float:
    """Send one streaming chat request and return the time to its first line.

    "Time to first byte" is approximated by the arrival of the first line
    yielded by ``response.iter_lines()``. The clock is stopped right after
    that line is received, so printing it and closing the streamed response
    are not counted in the measurement.

    Args:
        client: Groq client used to issue the request.
        prompt: Content of the single user message that is sent.
        model: Model identifier passed to the API.

    Returns:
        Milliseconds elapsed between issuing the request and receiving the
        first streamed line. If the stream yields no line at all, the time
        until the stream was exhausted is returned instead.
    """
    # perf_counter() is monotonic and high-resolution; time.time() can jump
    # when the system clock is adjusted.
    started = time.perf_counter()
    with client.chat.completions.with_streaming_response.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
        stream=True,
    ) as response:
        first_line = next(iter(response.iter_lines()), None)
        elapsed = time.perf_counter() - started

    # Echo the first line only after the clock has been stopped.
    if first_line is not None:
        print(first_line)

    return elapsed * 1000  # Convert to milliseconds


def summarize_ttfb(samples_ms) -> dict:
    """Compute the statistical overview of a list of TTFB samples.

    Args:
        samples_ms: Non-empty sequence of TTFB measurements in milliseconds.

    Returns:
        Dict with the keys ``mean``, ``median``, ``stdev``, ``min`` and
        ``max``. ``stdev`` is NaN when there is only one sample, because the
        sample standard deviation needs at least two data points.

    Raises:
        statistics.StatisticsError: If ``samples_ms`` is empty.
    """
    if len(samples_ms) > 1:
        spread = statistics.stdev(samples_ms)
    else:
        spread = float("nan")
    return {
        "mean": statistics.mean(samples_ms),
        "median": statistics.median(samples_ms),
        "stdev": spread,
        "min": min(samples_ms),
        "max": max(samples_ms),
    }


def main():
    """Run NUM_INFERENCES requests and print the TTFB statistics."""
    # Create a Groq client instance
    client = Groq(api_key=GROQ_API_KEY)

    # Collect one TTFB sample (in milliseconds) per inference
    ttfb_results = [measure_ttfb_ms(client) for _ in range(NUM_INFERENCES)]

    # Calculate and display the statistical overview
    stats = summarize_ttfb(ttfb_results)
    print(f"Mean TTFB: {stats['mean']:.2f} ms")
    print(f"Median TTFB: {stats['median']:.2f} ms")
    print(f"Standard Deviation: {stats['stdev']:.2f} ms")
    print(f"Min TTFB: {stats['min']:.2f} ms")
    print(f"Max TTFB: {stats['max']:.2f} ms")


if __name__ == "__main__":
    main()
	import os
	import json
	import time
	import statistics

	from groq import Groq

	# Groq API key (read from the environment) and the number of inferences to run
	GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
	NUM_INFERENCES = 10

	# Create a Groq client instance
	client = Groq(api_key=GROQ_API_KEY)

	# Prompt sent on every request (replace with your own input)
	input_prompt = "Tell me about Hamlet's state of mind when he said \"to be or not to be?\""

	# TTFB samples, in milliseconds
	ttfb_results = []

	for _ in range(NUM_INFERENCES):
		# perf_counter() is monotonic and high-resolution; time.time() can
		# jump when the system clock is adjusted.
		start_time = time.perf_counter()

		# Send the request to the Groq model
		with client.chat.completions.with_streaming_response.create(
			messages=[
				{
					"role": "user",
					"content": input_prompt,
				}
			],
			model="llama3-70b-8192",
			stream=True,
		) as response:
			first_line = next(iter(response.iter_lines()), None)
			# Stop the clock as soon as the first line has arrived, so that
			# printing it and closing the response are not measured.
			end_time = time.perf_counter()

		if first_line is not None:
			print(first_line)

		# Calculate the TTFB and add it to the results list
		ttfb = end_time - start_time
		ttfb_results.append(ttfb * 1000)  # Convert to milliseconds

	# Calculate and display the statistical overview
	mean_ttfb = statistics.mean(ttfb_results)
	median_ttfb = statistics.median(ttfb_results)
	# statistics.stdev needs at least two samples; report NaN for a single run
	stddev_ttfb = statistics.stdev(ttfb_results) if len(ttfb_results) > 1 else float("nan")
	min_ttfb = min(ttfb_results)
	max_ttfb = max(ttfb_results)

	print(f"Mean TTFB: {mean_ttfb:.2f} ms")
	print(f"Median TTFB: {median_ttfb:.2f} ms")
	print(f"Standard Deviation: {stddev_ttfb:.2f} ms")
	print(f"Min TTFB: {min_ttfb:.2f} ms")
	print(f"Max TTFB: {max_ttfb:.2f} ms")