Last active
April 18, 2024 02:03
-
-
Save eurica/ac9438313a6f8a4b34860f1a538625fc to your computer and use it in GitHub Desktop.
Quick one-off to compare how LLMs handle tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// WARNING: this code is 80% machine generated with GitHub Copilot.
// I've fixed the bugs I've found, but I've only refactored in the rare cases where I literally couldn't follow the code.
// This is a bad codebase.
require("dotenv").config(); | |
const { HeliconeProxyOpenAI} = require("@helicone/helicone"); | |
const { Anthropic } = require('@anthropic-ai/sdk'); | |
const OpenAI = HeliconeProxyOpenAI; | |
const fs = require("fs"); | |
// Helicone proxy headers attached to every request made by both clients below.
const headers = {
  "Helicone-Auth": `Bearer ${process.env.HELICONE_API_KEY}`,
  // Default caching limit is 7 days.
  "Helicone-Cache-Enabled": "true",
  // See https://docs.helicone.ai/features/advanced-usage/retries
  // NOTE(review): retries are switched off here, so the two retry tuning
  // headers below are presumably ignored — confirm against the Helicone docs.
  "Helicone-Retry-Enabled": "false",
  "helicone-retry-num": "1",
  "helicone-retry-factor": "10",
  // Let's start with 1000/minute for now.
  "Helicone-RateLimit-Policy": "1000;w=60;",
};

// OpenAI client, routed through the Helicone OpenAI proxy endpoint.
const openAiClient = new OpenAI({
  defaultHeaders: headers,
  baseURL: "https://oai.hconeai.com/v1",
  apiKey: process.env.OPENAI_API_KEY,
});

// Anthropic client, routed through the Helicone Anthropic proxy endpoint.
const anthropic = new Anthropic({
  defaultHeaders: headers,
  baseURL: "https://anthropic.hconeai.com/",
  apiKey: process.env.ANTHROPIC_API_KEY,
});
/**
 * Synchronously reads a UTF-8 file and parses it as JSON.
 *
 * @param {string} filePath - Path of the JSON file to read.
 * @returns {*} The parsed JSON value.
 * @throws {Error} If the file cannot be read (error from `fs.readFileSync`).
 * @throws {SyntaxError} If the content is not valid JSON; the message names
 *   the offending file and the original error is attached as `cause`.
 */
function loadJsonFile(filePath) {
  const raw = fs.readFileSync(filePath, "utf-8");
  // readFileSync keeps a leading byte-order mark, which JSON.parse rejects;
  // drop it so files saved as "UTF-8 with BOM" still load.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;
  try {
    return JSON.parse(text);
  } catch (err) {
    throw new SyntaxError(`Invalid JSON in ${filePath}: ${err.message}`, {
      cause: err,
    });
  }
}
// Evaluation fixture: the tweets to classify, the criteria to test them
// against, and the reference scores (indexed as scores[tweet][criterion]).
const tests = loadJsonFile("./tests.json");
const { tweets, criteria, scores } = tests;
/**
 * Asks an OpenAI chat model how likely it is that a tweet is about a topic.
 *
 * The instruction goes in the system message and the tweet in the user
 * message. Generation stops at the first "%" so the reply should hold only
 * the number, which is then extracted with `percentageToNumber`.
 *
 * @param {string} tweet - Text of the tweet to classify.
 * @param {string} criteria - Topic the tweet is tested against.
 * @param {string} [model="gpt-4-0125-preview"] - OpenAI model identifier.
 * @returns {Promise<number>} The number parsed from the model's reply
 *   (0 when `percentageToNumber` cannot find one).
 */
async function testTweetGPT(tweet, criteria, model = "gpt-4-0125-preview") {
  const instructions = `You will be given a tweet, return the probability that it is about ${criteria}. Only say a percentage between 0 and 100.`;

  const conversation = [
    { role: "system", content: instructions },
    { role: "user", content: [{ type: "text", text: tweet }] },
  ];

  const completion = await openAiClient.chat.completions.create({
    messages: conversation,
    model,
    // Fixed sampling settings so repeated runs are as repeatable as possible.
    temperature: 0,
    max_tokens: 256,
    top_p: 1,
    frequency_penalty: 0,
    presence_penalty: 0,
    stop: ["%"],
  });

  const reply = completion.choices[0].message.content;
  return percentageToNumber(reply);
}
/**
 * Asks an Anthropic Claude model how likely it is that a tweet is about a
 * topic.
 *
 * The whole prompt is sent as a single user message. Generation stops at the
 * first "%" so the reply should hold only the number, which is then extracted
 * with `percentageToNumber`.
 *
 * @param {string} tweet - Text of the tweet to classify.
 * @param {string} criteria - Topic the tweet is tested against.
 * @param {string} [model='claude-3-opus-20240229'] - Anthropic model identifier.
 * @returns {Promise<number>} The number parsed from the model's reply
 *   (0 when the reply is empty or contains no number).
 */
async function testTweetClaude(tweet, criteria, model = 'claude-3-opus-20240229') {
  const prompt = `What is the probability that this tweet is about ${criteria}? "${tweet}" . Only say a percentage between 0 and 100. If unsure or I make a mistake say 0%.`;
  const message = await anthropic.messages.create({
    max_tokens: 64,
    messages: [{ role: 'user', content: prompt }],
    model,
    temperature: 0,
    top_k: 1,
    stop_sequences: ['%'],
  });
  // The content array may be empty (e.g. if generation stops immediately);
  // optional chaining hands `undefined` to percentageToNumber, which reports
  // it and yields 0, instead of crashing the whole run with a TypeError.
  return percentageToNumber(message.content[0]?.text);
}
/**
 * Manual smoke test: scores one hard-coded tweet against one topic with the
 * default Claude model and prints the result.
 */
async function main() {
  const probability = await testTweetClaude("I hate bugs", "bugs");
  console.log(probability);
}
//main(); | |
/**
 * Extracts the first run of decimal digits from a model reply.
 *
 * @param {*} str - Reply text, e.g. "85" or "About 85". Non-string values
 *   (such as `null` or `undefined` from an empty reply) are treated as
 *   unparseable.
 * @returns {number} The first integer found in `str`, or 0 (after logging
 *   the offending value) when there is none.
 */
function percentageToNumber(str) {
  // Check explicitly instead of relying on a thrown TypeError: a bare
  // try/catch would also hide unrelated programming errors.
  const digits = typeof str === "string" ? str.match(/\d+/) : null;
  if (digits === null) {
    console.log("Doesn't parse as number: " + str);
    return 0;
  }
  return Number.parseInt(digits[0], 10);
}
const os = require("os"); | |
// Results go to this CSV file; the "w" flag truncates output from any
// earlier run when the stream is opened.
const csvFileName = "output.csv";
const csvStream = fs.createWriteStream(csvFileName, {
  flags: "w",
});
/**
 * Appends one row to the CSV output stream.
 *
 * Every cell is wrapped in double quotes, with embedded double quotes
 * doubled. `null` and `undefined` cells are written as empty strings rather
 * than throwing.
 *
 * @param {Array<*>} data - Cell values for the row, in column order.
 * @returns {void}
 */
function writeToCSV(data) {
  const cells = data.map((item) => {
    // String(...) instead of item.toString(): the latter throws on
    // null/undefined (e.g. a null tweet in the fixture).
    const text = String(item ?? "");
    return `"${text.replace(/"/g, '""')}"`;
  });
  csvStream.write(cells.join(",") + os.EOL);
}
// Upper bound on how many tweets AND how many criteria are processed.
const LIMIT = 2000;

/**
 * Scores every tweet against every criterion with four models and writes the
 * results to the CSV output, one row per tweet.
 *
 * Row layout: tweet number, then for each criterion the reference score
 * (scaled to 0-100) followed by a (score, absolute error) pair for GPT-4,
 * GPT-3.5, Claude Haiku and Claude Opus, and finally the tweet text. Note
 * that the header labels both the first and the last column "Tweet".
 *
 * Each model is queried exactly once per (tweet, criterion) cell; the
 * returned score is reused for both the score column and the error column.
 * Calls are awaited one after another.
 *
 * @returns {Promise<void>} Resolves once every row has been handed to
 *   `writeToCSV`; rejects if any model call fails.
 */
async function compare() {
  const criteriaCount = Math.min(criteria.length, LIMIT);
  const tweetCount = Math.min(tweets.length, LIMIT);

  // Header row.
  const header = ["Tweet"];
  for (let c = 0; c < criteriaCount; c++) {
    const name = criteria[c];
    header.push(
      `${name} - Training set`,
      `${name} - GPT4`,
      `${name} - GPT4 (error)`,
      `${name} - GPT3`,
      `${name} - GPT3 (error)`,
      `${name} - ClaudeHaiku`,
      `${name} - ClaudeHaiku (error)`,
      `${name} - ClaudeOpus`,
      `${name} - ClaudeOpus (error)`
    );
  }
  header.push("Tweet");
  writeToCSV(header);

  // One data row per tweet.
  for (let t = 0; t < tweetCount; t++) {
    const tweet = tweets[t];
    const row = [t + 1];
    console.log("Tweet: " + tweet);

    for (let c = 0; c < criteriaCount; c++) {
      const criterion = criteria[c];
      // Reference scores are stored as 0..1; models answer in 0..100.
      const score = scores[t][c] * 100;

      const gpt4Score = await testTweetGPT(tweet, criterion, "gpt-4-0125-preview");
      const gpt3Score = await testTweetGPT(tweet, criterion, "gpt-3.5-turbo-0125");
      const haikuScore = await testTweetClaude(tweet, criterion, "claude-3-haiku-20240307");
      const opusScore = await testTweetClaude(tweet, criterion, "claude-3-opus-20240229");

      row.push(
        Math.abs(score),
        Math.abs(gpt4Score),
        Math.abs(score - gpt4Score),
        Math.abs(gpt3Score),
        Math.abs(score - gpt3Score),
        Math.abs(haikuScore),
        Math.abs(score - haikuScore),
        Math.abs(opusScore),
        Math.abs(score - opusScore)
      );
    }

    row.push(tweet);
    console.log(row);
    writeToCSV(row);
  }
}

// Surface failures explicitly instead of leaving the promise floating.
compare().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"tweets": [ | |
"Changed my handle on Post. Another benefit of arriving there early - the handle you want is still available! Find me at https://post.news/clarissac", | |
"Ducks and Wildcats get it underway with a 5:05 p.m. first pitch at PK tonight. #GoDucks Tickets https://bit.ly/3xtWtQB Watch https://bit.ly/2BBNm2W Listen https://bit.ly/3ZeEsBZ Stats https://bit.ly/3lJtki1", | |
"An executive branch already combatting a public perception about POTUS’ ability to work should know better, whether a real issue or not.", | |
"Trump confirms he’ll be in DC for immunity arguments at the federal appeals court on Tuesday.", | |
"recently signed a job offer im so excited about leaving this founder life for the rest of yall", | |
"The current age and party profile of Oregon registered voters.", | |
"The generative AI question: build, buy, or both? Discover the advantages of aligning AI initiatives with your business objectives: https://msft.it/6012cW2ew", | |
"Jaw Dropping Graphics & An Amazing Story The No.1 Quest Game of 2024 Play your way through a visually-stunning fantasy RPG with 500+ Champions", | |
"This photo is taken out of context. The video you posted in the comments shows that you are a liar.", | |
"@JohnBoozman @sentomcotton @RepWesterman @rep_stevewomack @RepFrenchHill @RepRickCrawford Recent reports show the GOPs investigation into President Biden and Hunter Biden ARE influenced by Russian propaganda. This undermines our democratic values and national integrity. I", | |
"Why are you more concerned with securing Putin's border than America's? Fund Ukraine NOW!", | |
"Calling all #developers! Innovate with our real-time and historical data on the X API. Get started with Pro", | |
"My wife saw bumper sticker that said 'Off to work I go, to pay my neighbor's welfare'. I want the same sticker, but with '(positive connotation)' added on", | |
"An executive branch already combatting a public perception about POTUS’ ability to work should know better, whether a real issue or not.", | |
"Rishi Sunak targets 'arrogant' Starmer at Tory local election launch", | |
"" | |
], | |
"criteria": ["sports", "politics", "american politics", "ads", "rudeness"], | |
"scores": [ | |
[0,0,0,0,0], | |
[1,0,0,1,0], | |
[0,1,1,0,0], | |
[0,1,1,0,0], | |
[0,0,0,0,0], | |
[0,1,1,0,0], | |
[0,0,0,1,0], | |
[0,0,0,1,0], | |
[0,0,0,0,1], | |
[0,1,1,0,0], | |
[0,1,1,0,0], | |
[0,0,0,1,0], | |
[0,0,0,0,0], | |
[0,1,1,0,0], | |
[0,1,0,0,0], | |
[0,1,0,0,0] | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment