Skip to content

Instantly share code, notes, and snippets.

@owulveryck
Last active June 18, 2024 15:29
Show Gist options
  • Save owulveryck/dcf3de4e0ad82ab99bf116828112eacd to your computer and use it in GitHub Desktop.
import pandas as pd
import ollama
import chromadb
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
# Load Parquet File
# NOTE(review): this fetches the parquet over HTTP on every run — consider
# caching the file locally if the script is re-run often.
url = "https://blog.owulveryck.info/assets/sampledata/wardley_book/wardleyBook.parquet"
df = pd.read_parquet(url)
# Ensure all content is decoded into strings
def decode_content(content):
    """Return `content` as a str, decoding UTF-8 bytes if necessary.

    Parquet binary columns can come back as raw ``bytes``; anything
    else (already-str values, None, NaN) is passed through unchanged.
    """
    # The pasted original had lost its indentation (IndentationError);
    # logic is unchanged, only the structure is restored.
    if isinstance(content, bytes):
        return content.decode('utf-8')
    return content
# Apply decode to each row in the 'content' column
# (row-wise apply is fine here; the dataset is a single small book).
df['content'] = df['content'].apply(decode_content)
documents = df['content'].tolist() # Adjust 'content' if the actual column name differs
# Open (or create) a persistent ChromaDB store on local disk.
client = chromadb.PersistentClient(
    path="db.chroma",
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)
# get_or_create_collection makes the script safe to re-run:
# the original create_collection raises if the collection already exists.
collection = client.get_or_create_collection(name="wardley_content_embeddings")
# Store each document in a vector embedding database.
# The pasted original had lost the loop-body indentation (IndentationError);
# logic is unchanged, only the structure is restored.
for i, d in enumerate(documents):
    # One embedding call per document; the response carries the vector
    # under the "embedding" key as a plain list of floats.
    response = ollama.embeddings(model="mxbai-embed-large", prompt=d)
    embedding = response["embedding"]
    collection.add(
        ids=[str(i)],
        embeddings=[embedding],
        documents=[d]
    )
print("Embeddings stored successfully in ChromaDB.")
package main
import (
"fmt"
"log"
"os"
"github.com/xitongsys/parquet-go-source/local"
"github.com/xitongsys/parquet-go/parquet"
"github.com/xitongsys/parquet-go/writer"
)
// Figure is one row of the output parquet file: an image plus its metadata.
// NOTE(review): BYTE_ARRAY without convertedtype=UTF8 stores raw bytes —
// intentional for Content (binary JPEG data); confirm that consumers of
// this file read 'format' the same way.
type Figure struct {
FigureID int32 `parquet:"name=figure_id, type=INT32"`
Format string `parquet:"name=format, type=BYTE_ARRAY"`
Content string `parquet:"name=content, type=BYTE_ARRAY"`
}
// main packs 250 JPEG figures ("Figure 0.jpeg" … "Figure 249.jpeg" in the
// working directory) into a single snappy-compressed parquet file,
// figures.parquet, one Figure row per image.
func main() {
	fw, err := local.NewLocalFileWriter("figures.parquet")
	if err != nil {
		log.Fatal("Can't create local file", err)
	}
	defer fw.Close()

	// 4 is the parquet-go concurrent writer count.
	pw, err := writer.NewParquetWriter(fw, new(Figure), 4)
	if err != nil {
		log.Fatal("Can't create parquet writer", err)
	}
	pw.RowGroupSize = 128 * 1024 * 1024 // 128M
	pw.PageSize = 8 * 1024              // 8K
	pw.CompressionType = parquet.CompressionCodec_SNAPPY

	num := 250
	for i := 0; i < num; i++ {
		// BUG FIX: the original formatted the filename with num (always 250),
		// re-reading the same — likely nonexistent — file every iteration.
		c, err := os.ReadFile(fmt.Sprintf("Figure %v.jpeg", i))
		if err != nil {
			log.Fatal(err)
		}
		fig := Figure{
			FigureID: int32(i),
			Format:   "jpeg",
			Content:  string(c),
		}
		if err = pw.Write(fig); err != nil {
			log.Fatal("Write error", err)
		}
	}
	if err = pw.WriteStop(); err != nil {
		log.Fatal("WriteStop error", err)
	}
	log.Println("Write Finished")
}
import numpy as np
import ollama
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings
from chromadb import PersistentClient
# Initialize ChromaDB client
# Points at the same on-disk store ("db.chroma") the ingestion script wrote;
# tenant/database defaults must match or the collection will not be found.
client = PersistentClient(
path="db.chroma",
settings=Settings(),
tenant=DEFAULT_TENANT,
database=DEFAULT_DATABASE,
)
# Define the prompt
prompt = "what are the sources of inertia?"

# Get embeddings from the Ollama service (same model as at ingestion time,
# otherwise the vectors are not comparable).
response = ollama.embeddings(model="mxbai-embed-large", prompt=prompt)
embedding = response["embedding"]  # already a list of floats; no np.array needed

collection = client.get_collection("wardley_content_embeddings")

# Query ChromaDB for the single most similar document.
# Fix: the original assigned `embedding` and then never used it,
# re-indexing response["embedding"] in the query instead.
results = collection.query(
    query_embeddings=[embedding],
    n_results=1
)
# Pull the single best-matching document out of the nested query result
# (first query, first hit).
retrieved = results['documents'][0][0]

# generate a response combining the prompt and data we retrieved in step 2
answer = ollama.generate(
    model="llama3",
    prompt=f"Using this data: {retrieved}. Respond to this prompt: {prompt}"
)
print(answer['response'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment