Recently, I learned that some of the top reward models on RewardBench were trained on a preference dataset that is unintentionally contaminated with the benchmark. The dataset, Skywork Reward Preference 80K, picked up the contamination from one of the Magpie datasets mixed into it. Magpie is a new method for having language models generate instructions: you prompt the model with an empty chat template (just the pre-query tokens), and its most likely completion is a plausible user query. The contaminated source within the Skywork dataset is Argilla/magpie-ultra-v0.1, generated with Llama 3.1 405B Instruct. I would never have expected a Magpie dataset to be contaminated.
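To make the method concrete, here's a minimal sketch of the Magpie trick with Hugging Face transformers. The model name is illustrative (magpie-ultra used the 405B model, which won't fit on a laptop), and the pre-query string is just Llama 3's chat-template header, nothing Magpie-specific:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative smaller model; magpie-ultra used Llama 3.1 405B Instruct.
model_name = "meta-llama/Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

# The "empty chat template": only the pre-query tokens, so the model's
# most likely continuation is a synthetic user instruction.
pre_query = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
# add_special_tokens=False since the BOS token is already in the string
inputs = tokenizer(pre_query, return_tensors="pt", add_special_tokens=False).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=1.0)
print(tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```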
What seems likely is that Meta trained on some of these prompts, but the exact provenance of each prompt needs more investigation. For example, we learned that some of the prompts we used in our LLMBar subsets were sourced from popular training sets like Alpaca, rather than from another evaluation set. Here are some example matches between RewardBench and Skyworks (the score is the count-based 7- to 13-gram overlap computed by the script below):
Match 1 (Score: 2128.00):
RewardBench (2838): We're gonna play a game. It's a kind of "20 Questions" but about my personality. Where I'm gonna sta...
Skyworks (1078): We're gonna play a game. It's a kind of "20 Questions" but about my personality. Where I'm gonna sta...
Match 2 (Score: 350.00):
RewardBench (2829): I want you to act as a novelist. You will come up with creative and captivating stories that can eng...
Skyworks (4257): I want you to act as a novelist. You will come up with creative and captivating stories that can eng...
Match 3 (Score: 294.00):
RewardBench (214): I want you to act as a Tiktok Ads Keywords generator. Your task is to come up with keywords that can...
Skyworks (3360): I want you to act as a Tiktok Ads Keywords generator. Your task is to come up with keywords that can...
Here's a breakdown per subset of RewardBench:
| RewardBench Subset | Count |
|---|---|
| math-prm | 233 |
| hep-rust | 53 |
| hep-js | 50 |
| hep-java | 50 |
| hep-python | 50 |
| hep-go | 50 |
| hep-cpp | 44 |
| refusals-dangerous | 32 |
| refusals-offensive | 31 |
| llmbar-adver-GPTInst | 12 |
| llmbar-adver-neighbor | 12 |
| alpacaeval-easy | 11 |
| alpacaeval-length | 11 |
| alpacaeval-hard | 10 |
| llmbar-adver-manual | 5 |
| llmbar-natural | 5 |
| mt-bench-med | 5 |
| mt-bench-hard | 5 |
| llmbar-adver-GPTOut | 2 |
| xstest-should-refuse | 2 |
A dataset of all of the overlapping prompts is released here.
A cleaned version of the Skywork preferences dataset (all examples sharing n-grams of length 7+ with RewardBench removed) is available here.
The leaderboard has been updated to include a flag for models that we know trained on this dataset.
The script used to find the contamination was created primarily by kernelmachine and heavily modified with the help of Claude:
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm
from datasets import load_dataset, Dataset, concatenate_datasets
import numpy as np
import json
from datetime import datetime
# Load datasets
df = load_dataset("allenai/reward-bench", split="filtered")
dfsw = load_dataset("Skywork/Skywork-Reward-Preference-80K-v0.1", split="train")
# Keep track of contaminated indices for each source
contaminated_indices = {
    'magpie_ultra': set(),
    'magpie_pro_llama3.1': set(),
    'helpsteer2': set(),
    'offsetbias': set(),
    'wildguard': set(),
    'magpie_pro': set(),
}
best_matches = []
for skyworks_source in ['magpie_ultra', 'magpie_pro_llama3.1', 'helpsteer2', 'offsetbias', 'wildguard', 'magpie_pro']:
    print(f"\nAnalyzing source: {skyworks_source}")
    df1 = dfsw.filter(lambda x: x['source'] == skyworks_source)

    reward_bench_prompts = df['prompt']
    skyworks_prompts = [ex['chosen'][0]['content'] for ex in df1]

    # Vectorize all prompts
    print("Vectorizing prompts...")
    vectorizer = CountVectorizer(ngram_range=(7, 13))
    all_prompts = list(reward_bench_prompts) + list(skyworks_prompts)
    vectorized = vectorizer.fit_transform(tqdm(all_prompts))

    # Split vectorized matrix back into two datasets
    n_rb = len(reward_bench_prompts)
    rb_vectorized = vectorized[:n_rb]
    sw_vectorized = vectorized[n_rb:]

    # Calculate similarity matrix
    print("Calculating similarities...")
    similarity_matrix = (rb_vectorized @ sw_vectorized.T).toarray()

    # Find all contaminated indices for this source
    for rb_idx in tqdm(range(similarity_matrix.shape[0]), desc="Finding matches"):
        matches = np.where(similarity_matrix[rb_idx] > 0)[0]
        contaminated_indices[skyworks_source].update(matches)

    # Find best matching pairs for each RewardBench prompt
    for rb_idx in tqdm(range(similarity_matrix.shape[0]), desc="Finding best matches"):
        # Find the best matching Skyworks prompt for this RewardBench prompt
        best_sw_idx = np.argmax(similarity_matrix[rb_idx])
        best_score = similarity_matrix[rb_idx, best_sw_idx]

        # Only include if there is actually an overlap (score > 0)
        # Note, there are other prompts with overlap in Skyworks!
        if best_score > 0:
            rb_prompt = reward_bench_prompts[rb_idx]
            sw_prompt = skyworks_prompts[best_sw_idx]

            # Full Skyworks row for this match
            sw_dict = df1.select([best_sw_idx]).to_dict()

            best_matches.append({
                'reward_bench_id': df.select([rb_idx])['id'][0],
                'skyworks_idx': int(best_sw_idx),  # Convert numpy int to regular int for JSON
                'reward_bench_prompt': rb_prompt,
                'skyworks_prompt': sw_prompt,
                'overlap_score': float(best_score),  # Convert to float for JSON serialization
                'skyworks_full': sw_dict,
                'skyworks_subset': skyworks_source
            })
# Sort matches by overlap score
best_matches.sort(key=lambda x: x['overlap_score'], reverse=True)
# Save as JSON
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
json_filename = f'best_matches_{timestamp}.json'
with open(json_filename, 'w', encoding='utf-8') as f:
    json.dump(best_matches, f, ensure_ascii=False, indent=2)
# Print some statistics
print(f"\nAnalysis Summary:")
print(f"Total RewardBench prompts with matches: {len(best_matches)}")
if best_matches:
    print(f"Overlap score range: {best_matches[-1]['overlap_score']:.2f} to {best_matches[0]['overlap_score']:.2f}")

# Print top 5 matches as example
print("\nTop 5 matches:")
for i, match in enumerate(best_matches[:5]):
    print(f"\nMatch {i+1} (Score: {match['overlap_score']:.2f}):")
    print(f"RewardBench ({match['reward_bench_id']}): {match['reward_bench_prompt'][:100]}...")
    print(f"Skyworks ({match['skyworks_idx']}): {match['skyworks_prompt'][:100]}...")
print(f"\nSaved best matches to: {json_filename}")
output_data = Dataset.from_list(best_matches)
output_data.push_to_hub("natolambert/skyworks-rewardbench-contamination")
# Create clean datasets for each source
clean_datasets = []
for source in contaminated_indices.keys():
    # Get the subset for this source
    source_data = dfsw.filter(lambda x: x['source'] == source)

    # Keep only the indices that were never flagged as contaminated
    clean_indices = [i for i in range(len(source_data)) if i not in contaminated_indices[source]]

    # Filter the dataset
    clean_source_data = source_data.select(clean_indices)
    clean_datasets.append(clean_source_data)

    print(f"\nStats for {source}:")
    print(f"Original size: {len(source_data)}")
    print(f"Contaminated examples: {len(contaminated_indices[source])}")
    print(f"Clean examples: {len(clean_indices)}")
# Combine all clean datasets
clean_dfsw = concatenate_datasets(clean_datasets)
clean_dfsw.push_to_hub("natolambert/skywork-preferences-80k-v0.1-cleaned")
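If you just want the artifacts rather than rerunning the search, they can be pulled straight from the Hub (a quick sketch, assuming the default train split from the push_to_hub calls above):

```python
from datasets import load_dataset

# Overlapping prompts found by the script above
overlaps = load_dataset("natolambert/skyworks-rewardbench-contamination", split="train")
# Skywork preferences with the contaminated examples removed
clean_prefs = load_dataset("natolambert/skywork-preferences-80k-v0.1-cleaned", split="train")
print(len(overlaps), len(clean_prefs))
```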