Skip to content

Instantly share code, notes, and snippets.

@lankz
Created May 11, 2023 15:06
Show Gist options
  • Save lankz/ca3c9fbe83d9c915880facd861793c02 to your computer and use it in GitHub Desktop.
Save lankz/ca3c9fbe83d9c915880facd861793c02 to your computer and use it in GitHub Desktop.
require 'dotenv'
require 'ruby/openai'
require 'csv'
require 'parallel'
require 'tmpdir'
require 'ruby-progressbar'
Dotenv.load()
# Embeds every text file under training-data/ with the OpenAI embeddings
# endpoint and writes the results to embeddings.csv (columns: embedding, text).
#
# The files are processed in batches of 100 on 8 threads. Each batch writes
# its rows to its own csv file in a temporary directory; the per-batch files
# are then concatenated, in batch order, into the final embeddings.csv.
training_data = Dir.glob("training-data/**/*.txt")

progress = ProgressBar.create(
  total: training_data.length,
  format: "%a %e %P% Processed: %c from %C")
# several worker threads update the same bar, so serialise the increments
progress_lock = Mutex.new

# look the key up once, before any work starts, so a missing key fails
# immediately with a KeyError instead of as a failed request inside a thread
api_key = ENV.fetch('OPENAI_API_KEY')

# pair each batch with its position so that every batch gets a distinct
# csv file name inside the shared temporary directory
batches = training_data.each_slice(100).each_with_index.to_a

# the block form of Dir.mktmpdir removes the directory (and the per-batch
# csv files in it) when the block exits, whether normally or by an exception;
# the merge step therefore has to happen inside the block
Dir.mktmpdir do |temp_dir|
  # Parallel.map collects each block's return value in input order, so the
  # final csv has a stable row order and no array is shared between threads
  csv_files = Parallel.map(batches, in_threads: 8) do |(files, index)|
    openai = OpenAI::Client.new(access_token: api_key)
    csv_filename = File.join(temp_dir, "embeddings-#{index}.csv")

    CSV.open(csv_filename, "w") do |csv|
      files.each do |file|
        # String#dump yields a quoted, escaped, single-line form of the file
        # contents; that escaped form is what gets embedded and stored
        text = File.read(file).dump
        response = openai.embeddings(
          parameters: {
            model: "text-embedding-ada-002",
            input: text
          })

        # an error payload carries no 'data' entry; fail with the file name
        # and the API's error instead of a NoMethodError on nil
        embedding = (response['data'] || []).dig(0, 'embedding')
        if embedding.nil?
          raise "OpenAI returned no embedding for #{file}: #{response['error'].inspect}"
        end

        csv << [embedding, text]
        progress_lock.synchronize { progress.increment }
      end
    end

    csv_filename
  end

  # combine the per-batch csv files into a single csv file with a header row
  CSV.open("embeddings.csv", "w") do |csv|
    csv << [:embedding, :text]
    csv_files.each do |csv_file|
      CSV.foreach(csv_file) { |row| csv << row }
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment