Skip to content

Instantly share code, notes, and snippets.

@anadim
Created March 15, 2024 03:17
Show Gist options
  • Save anadim/6a7318ac586b5021fc9ace94ab7018c5 to your computer and use it in GitHub Desktop.
Save anadim/6a7318ac586b5021fc9ace94ab7018c5 to your computer and use it in GitHub Desktop.
# Import necessary modules
# !pip install openai==0.28
import os
import random
import re
import time
from decimal import Decimal

import anthropic
import matplotlib.pyplot as plt
import numpy as np
import openai
# Function to generate a prompt for the models
def generate_prompt(a, b):
    """Build the multiplication question that is sent to every model.

    Args:
        a: Left operand, interpolated into the question as-is.
        b: Right operand, interpolated into the question as-is.

    Returns:
        A one-line prompt asking for ``a * b`` with no reasoning.
    """
    question = "What is {} * {}?".format(a, b)
    instruction = "No reasoning, just provide the result."
    return " ".join((question, instruction))
# Function to extract the result from model responses
def extract_result(text):
    """Return the last integer that appears in a model response.

    Commas and periods are removed before searching so that grouped digits
    ("1,234,567" or "1.234.567") and a trailing sentence period do not split
    the number. A side effect is that a decimal such as "3.5" is read as 35.

    Args:
        text: Raw response text. ``None`` or an empty string is accepted and
            treated as "no answer".

    Returns:
        The last run of digits in ``text`` as an ``int``, or ``None`` when
        there is no text, no digits, or the digit run cannot be converted.
    """
    # A response may carry no text at all; treat that as "no answer" instead
    # of failing on ``None.replace``.
    if not text:
        return None
    cleaned = text.replace(",", "").replace(".", "")
    digit_runs = re.findall(r"\d+", cleaned)
    if not digit_runs:
        return None
    try:
        return int(digit_runs[-1])
    except ValueError:
        # Recent Python versions cap int/str conversion length by default; a
        # runaway response with thousands of digits counts as a failed answer.
        return None
# Function to test a single model with a given prompt
def test_model(model_name, prompt, anthropic_client, max_retries=5, retry_delay=1.0):
    """Send ``prompt`` to one model and return its reply and the parsed integer.

    Model names starting with "claude" are queried through
    ``anthropic_client``; every other name is sent to
    ``openai.ChatCompletion``. Both requests use temperature 0.

    Failed attempts are retried with exponential backoff. The number of
    attempts is bounded so that a persistent failure (for example an invalid
    API key) surfaces as an error instead of looping forever.

    Args:
        model_name: Identifier of the model to query.
        prompt: User message to send.
        anthropic_client: Client object used for "claude" models.
        max_retries: Maximum number of attempts before giving up.
        retry_delay: Seconds to wait after the first failure; the wait
            doubles after each further failure.

    Returns:
        A ``(response_text, parsed_result)`` tuple, where ``parsed_result``
        is whatever ``extract_result`` returns for the response text.

    Raises:
        RuntimeError: If every attempt failed. The last underlying error is
            attached as the cause.
    """
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            if model_name.startswith("claude"):
                response = anthropic_client.messages.create(
                    model=model_name,
                    max_tokens=1000,
                    temperature=0.0,
                    messages=[{"role": "user", "content": prompt}],
                )
                output = response.content[0].text
            else:
                response = openai.ChatCompletion.create(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0,
                )
                output = response.choices[0].message.content
            return output, extract_result(output)
        except openai.error.APIError as e:
            print(f"OpenAI API Error: {str(e)}")
            last_error = e
        except anthropic.APIError as e:
            print(f"Anthropic API Error: {str(e)}")
            last_error = e
        except Exception as e:
            print(f"Unexpected Error: {str(e)}")
            last_error = e
        if attempt < max_retries:
            print("Retrying...")
            # Back off so a struggling or rate-limited API is not hammered.
            time.sleep(retry_delay * 2 ** (attempt - 1))
    raise RuntimeError(
        f"Model {model_name} failed after {max_retries} attempts"
    ) from last_error
# Function to run the experiment across different models and digit lengths
def run_experiment(models, digit_lengths, anthropic_client, num_samples=100):
    """Measure each model's average relative error on random multiplications.

    For every digit length, ``num_samples`` pairs of random integers with
    exactly that many digits are drawn. Each model is asked for the product
    and its answer is scored by relative error, capped at 1. A response with
    no parseable number scores 1. Progress is printed as the run proceeds.

    Args:
        models: Iterable of model names to query.
        digit_lengths: Iterable of operand lengths (number of digits).
        anthropic_client: Client passed through to ``test_model``.
        num_samples: Number of random products per digit length.

    Returns:
        A dict mapping each model name to a list of average relative errors
        (``Decimal``), one entry per digit length in iteration order.

    Raises:
        ValueError: If ``num_samples`` is less than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    max_error = Decimal(1)
    results = {model: [] for model in models}
    for length in digit_lengths:
        print(f"\n\n\n\n ********** Digit Length: {length} **********")
        # Keep the running totals in Decimal throughout: mixing float and
        # Decimal in ``+=`` raises TypeError.
        total_relative_error = {model: Decimal(0) for model in models}
        low, high = 10 ** (length - 1), 10 ** length - 1
        for sample in range(num_samples):
            a = random.randint(low, high)
            b = random.randint(low, high)
            expected_result = a * b
            prompt = generate_prompt(a, b)
            print(f"\n\nSample {sample + 1}:")
            print(f"Expected Result: {expected_result}")
            for model in models:
                model_output, predicted_result = test_model(model, prompt, anthropic_client)
                if predicted_result is not None:
                    # Subtract as exact ints first; only the ratio is rounded
                    # to Decimal context precision.
                    ratio = Decimal(predicted_result - expected_result) / Decimal(expected_result)
                    relative_error = min(abs(ratio), max_error)
                else:
                    # No parseable answer counts as a complete miss.
                    relative_error = max_error
                total_relative_error[model] += relative_error
                print(f"\nModel: {model}")
                print(f"Output: {model_output}")
                print(f"Predicted Result: {predicted_result}")
                print(f"\n Relative Error: {relative_error:.4f}")
        for model in models:
            avg_relative_error = total_relative_error[model] / num_samples
            results[model].append(avg_relative_error)
            print(f"\nModel: {model}")
            print(f"Average Relative Error: {avg_relative_error:.4f}")
    return results
# Function to plot the results of the experiment
def plot_results(results, digit_lengths):
    """Plot average relative error against operand length, one curve per model.

    Models whose name starts with "claude" are drawn in shades of red, all
    others in shades of blue. Each curve gets a shaded band of plus/minus one
    standard deviation, clipped to [0, 1].

    The band width is a single value per model: the standard deviation of
    that model's averages across digit lengths. ``results`` holds only
    averages, so this is not a per-point sample deviation.

    The figure is drawn on the current pyplot figure; this function does not
    call ``plt.show()`` or save anything.

    Args:
        results: Dict mapping model name to a sequence of average relative
            errors, aligned with ``digit_lengths``. A shorter sequence is
            plotted against the leading digit lengths only.
        digit_lengths: Sequence of operand lengths used for the x axis.
    """
    claude_colors = ['#FF0000', '#FF4500', '#FF6347']  # Shades of red for Claude models
    openai_colors = ['#0000FF', '#1E90FF', '#00BFFF']  # Shades of blue for OpenAI models
    markers = ['o', 's', '^', 'D', 'v', '<']  # Different markers for each model
    lengths = np.array(digit_lengths)

    for i, (model, relative_errors) in enumerate(results.items()):
        y = np.array(relative_errors, dtype=float)  # Decimal averages -> float
        # Plot only positions that have both a digit length and a value, so
        # x and y always have matching lengths.
        n_points = min(len(lengths), len(y))
        if n_points == 0:
            continue  # nothing to draw for this model
        x = lengths[:n_points]
        y = y[:n_points]

        # One spread value for the whole curve (see docstring).
        std_dev = np.std(y)

        palette = claude_colors if model.startswith('claude') else openai_colors
        color = palette[i % len(palette)]
        # Cycle the markers so more than len(markers) models cannot overflow.
        marker = markers[i % len(markers)]

        plt.plot(x, y, label=model, color=color, linewidth=3, marker=marker, markersize=10)

        # Relative error is capped to [0, 1], so clip the band to that range.
        lower_bound = np.clip(y - std_dev, 0, 1)
        upper_bound = np.clip(y + std_dev, 0, 1)
        plt.fill_between(x, lower_bound, upper_bound, color=color, alpha=0.2)

    # Larger fonts for readability
    plt.xlabel("Number of Digits", fontsize=16)
    plt.ylabel("Relative Error", fontsize=16)
    plt.title("Relative Error vs. Number of Digits", fontsize=18)
    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.legend(fontsize=12)
    plt.tight_layout()
# API keys are read from the environment when set; otherwise replace the
# placeholder strings below with your actual keys.
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY", "your key buddy")
openai_api_key = os.environ.get("OPENAI_API_KEY", "your key buddy")
# Initialize the anthropic client
anthropic_client = anthropic.Anthropic(api_key=anthropic_api_key)
# Set the OpenAI API key
openai.api_key = openai_api_key
# List of models to test
models = ["claude-3-haiku-20240307", "claude-3-sonnet-20240229", "claude-3-opus-20240229", "gpt-3.5-turbo", "gpt-4-0314", "gpt-4-1106-preview"]
# Digit lengths to test
digit_lengths = range(5, 26, 1)  # Test multiplication of 5- to 25-digit operands
# Run the experiment
results = run_experiment(models, digit_lengths, anthropic_client)
# Plot the results
plot_results(results, digit_lengths)
# Display the figure; without this nothing appears when run as a plain script
plt.show()
@cx0
Copy link

cx0 commented Mar 15, 2024

@anadim Thank you for sharing this experiment! I think it would be interesting to control for the length of the predicted output (and perhaps compare the error rate only on the subset where the model gets the length of the output right).

Here are two randomly generated 25-digit numbers where each digit is required to be an even number.

even_digits = '02468'
non_zero_even_digits = '2468'

a = int(random.choice(non_zero_even_digits) + ''.join(random.choice(even_digits) for _ in range(length-1)))
b = int(random.choice(non_zero_even_digits) + ''.join(random.choice(even_digits) for _ in range(length-1)))

print(f'a: {a}, b: {b}')

a: 6820648660600460260288444, b: 4204860400244404400286262

If you ask claude-3-haiku-20240307 for their product:

Prompt: What is 6820648660600460260288444 * 4204860400244404400286262? No reasoning, just provide the result.

Response: 28,700,000,000,000,000,000,000,000

When you switch to claude-3-sonnet-20240229, the model's response repeats "524" 997 times in a row before hitting the 1000-token max_tokens limit:

Response: 2866952452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452452
4524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524524

And finally, when you use claude-3-opus-20240229, the model outputs a 50-digit response as expected, though it is the wrong answer.

Response: 28695427527426761933293024087671111137777777777768

P.S. I still need to run the control experiment, but just limiting the input digits to even numbers shows a similar pattern of relative error rates across models.

  • gpt-4-0314 not included because deprecated
  • n=20 samples and digit_lengths=range(5, 26, 4) for quick iteration.

LLM_mul_even

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment