Skip to content

Instantly share code, notes, and snippets.

@elct9620
Created February 11, 2024 14:35
Show Gist options
  • Save elct9620/5e2921e32c97c8bca4c3b6d11e5e7a80 to your computer and use it in GitHub Desktop.
Save elct9620/5e2921e32c97c8bca4c3b6d11e5e7a80 to your computer and use it in GitHub Desktop.
Hugo to Embedding Example

Hugo to Embedding Example

Convert Hugo articles to embeddings for the Cloudflare Vectorize Store

Add JSON output to Hugo

The single.json file is an example of adding JSON output to your theme.

Setup a new config ai.toml to extend output

[outputs]
  # Enable HTML to support Permalink to original content
  page = ["HTML", "JSON"]
hugo --config config.toml,ai.toml

Run converter

Download embedding.rb and run it with environment config

OPENAI_ACCESS_TOKEN=xxx CONCURRENCY=10 SOURCE=public/posts/**/index.json OUTPUT=embeddings.ndjson ruby embedding.rb

Insert into Vectorize Store

npx wrangler vectorize insert [INDEX_NAME] --file embeddings.ndjson
# frozen_string_literal: true
require 'bundler/inline'
require 'json'
gemfile do
source 'https://rubygems.org'
gem 'async'
gem 'async-http-faraday'
gem 'baran'
gem 'ruby-openai'
end
require 'async'
require 'async/barrier'
require 'async/semaphore'
require 'async/http/faraday'
require 'async/http/faraday/default'
# Loads Hugo JSON exports and splits their content into text chunks.
class ChunkService
  attr_reader :splitter

  # splitter: any object responding to #chunks(text, metadata:) (e.g. Baran::MarkdownSplitter)
  def initialize(splitter:)
    @splitter = splitter
  end

  # Read every file matched by the +paths+ glob, parse it as JSON, and split
  # its "content" field into chunks; the remaining JSON fields become the
  # metadata attached to each chunk. Returns a flat array of chunks.
  def call(paths:)
    posts = Dir[paths].map { |path| JSON.parse(File.read(path)) }
    posts.flat_map do |post|
      body = post.delete('content')
      splitter.chunks(body, metadata: post)
    end
  end
end
# Turns a single chunk into a Vectorize-ready record by fetching its
# embedding vector from the OpenAI API.
class PostEmbeddingService
  attr_reader :openai, :model

  # openai: an OpenAI client responding to #embeddings(parameters:)
  # model:  embedding model name sent with every request
  def initialize(openai:, model: 'text-embedding-3-small')
    @openai = openai
    @model = model
  end

  # Mutates +post+ in place: builds :id from the metadata id plus the chunk
  # cursor, fills :values with the embedding vector, and moves :text into the
  # metadata. Returns the record, or nil (after logging) if the API call fails.
  def call(post:)
    id_parts = [post[:metadata].delete('id'), post.delete(:cursor)]
    post[:id] = id_parts.compact.join('.')
    post[:values] = vector_of(text: post[:text])
    post[:metadata][:text] = post.delete(:text)
    post
  rescue StandardError => e
    puts "Unable to process post: #{post[:id]}: #{e.message}"
    nil
  end

  private

  # Request an embedding for +text+ and extract the vector from the response.
  def vector_of(text:)
    response = openai.embeddings(parameters: { model: model, input: text })
    response.dig('data', 0, 'embedding')
  end
end
# Convert posts to embeddings for the Cloudflare Vectorize Store:
# chunk the matched files, embed each chunk concurrently, and write one
# NDJSON line per successful embedding.
class EmbeddingUseCase
  attr_reader :chunk_service, :embedding_service

  # chunk_service:     responds to #call(paths:) and returns an array of chunks
  # embedding_service: responds to #call(post:) and returns the record or nil
  def initialize(chunk_service:, embedding_service:)
    @chunk_service = chunk_service
    @embedding_service = embedding_service
  end

  # Split the posts matched by the +paths+ glob and write their embeddings to
  # +dest+ as NDJSON, with at most +concurrency+ in-flight embedding calls.
  # The output file is now guaranteed to be closed even when chunking or the
  # async run raises (the original opened it without block/ensure and leaked
  # the handle on error).
  def execute(paths:, dest:, concurrency: 5)
    posts = chunk_service.call(paths: paths)
    File.open(dest, 'w') do |output|
      async(concurrency: concurrency) do |semaphore|
        posts.each do |post|
          semaphore.async do
            append(post: post, to: output)
          end
        end
      end
    end
  end

  private

  # Run the given block inside an Async reactor, yielding a semaphore that
  # caps concurrent tasks; waits for every scheduled task before returning
  # and stops the barrier even on error.
  def async(concurrency: 5)
    barrier = Async::Barrier.new
    semaphore = Async::Semaphore.new(concurrency, parent: barrier)
    Async do
      yield semaphore
      barrier.wait
    ensure
      barrier.stop
    end
  end

  # Embed one chunk and append it as a JSON line; chunks the embedding
  # service could not process (nil return) are skipped.
  def append(post:, to:)
    puts "Processing #{post[:metadata]['published_at']} - #{post[:metadata]['title']} @ #{post[:cursor]}"
    post = embedding_service.call(post: post)
    to.puts(JSON.dump(post)) if post
  end
end
# --- Script entry point: read configuration from the environment, wire the
# --- services together, and run the conversion.
access_token = ENV['OPENAI_ACCESS_TOKEN']
raise ArgumentError, 'OpenAI access token is required' unless access_token

model       = ENV.fetch('OPENAI_MODEL', 'text-embedding-3-small')
concurrency = ENV.fetch('CONCURRENCY', 5).to_i
source      = ENV.fetch('SOURCE', 'posts/**/*.json')
destination = ENV.fetch('OUTPUT', 'output.ndjson')

client = OpenAI::Client.new(access_token: access_token)
use_case = EmbeddingUseCase.new(
  chunk_service: ChunkService.new(splitter: Baran::MarkdownSplitter.new),
  embedding_service: PostEmbeddingService.new(openai: client, model: model)
)
use_case.execute(paths: source, dest: destination, concurrency: concurrency)
{
"id": "{{ .File.UniqueID }}",
"slug": "{{ .Slug }}",
"title": "{{ .Title }}",
"tags": {{ .Params.tags | jsonify }},
"published_at": {{ .Date | jsonify }},
"permalink": "{{ with .OutputFormats.Get "html" }}{{ .Permalink }}{{ end }}",
"language": "{{ .Lang }}",
"content": {{ .RawContent | jsonify }}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment