-
-
Save owulveryck/dcf3de4e0ad82ab99bf116828112eacd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import ollama | |
import chromadb | |
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings | |
# Read the Wardley book sample dataset (a Parquet file) into a DataFrame
url = "https://blog.owulveryck.info/assets/sampledata/wardley_book/wardleyBook.parquet"
df = pd.read_parquet(path=url)
# Normalise a cell value so that byte strings become text
def decode_content(content):
    """Return ``content`` as text.

    ``bytes`` values are decoded as UTF-8; any other value is returned
    unchanged.
    """
    return content.decode('utf-8') if isinstance(content, bytes) else content
# Apply decode to each row in the 'content' column so every document is a string
df['content'] = df['content'].apply(decode_content)
documents = df['content'].tolist()  # Adjust 'content' if the actual column name differs

# Open the on-disk Chroma database stored under "db.chroma"
client = chromadb.PersistentClient(
    path="db.chroma",
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

# The database is persistent, so the collection may already exist from a
# previous run; get_or_create_collection lets the script be re-run, whereas
# create_collection fails on an existing collection name.
collection = client.get_or_create_collection(name="wardley_content_embeddings")

# Embed each document with Ollama and store it under its row index.
# upsert (rather than add) keeps re-runs idempotent: an existing id is
# overwritten with the freshly computed embedding.
for i, d in enumerate(documents):
    response = ollama.embeddings(model="mxbai-embed-large", prompt=d)
    embedding = response["embedding"]
    collection.upsert(
        ids=[str(i)],
        embeddings=[embedding],
        documents=[d],
    )

print("Embeddings stored successfully in ChromaDB.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"fmt" | |
"log" | |
"os" | |
"github.com/xitongsys/parquet-go-source/local" | |
"github.com/xitongsys/parquet-go/parquet" | |
"github.com/xitongsys/parquet-go/writer" | |
) | |
// Figure is one row of the figures Parquet file written by main.
type Figure struct {
	// FigureID maps to the "figure_id" column (INT32).
	FigureID int32 `parquet:"name=figure_id, type=INT32"`
	// Format maps to the "format" column (BYTE_ARRAY).
	Format string `parquet:"name=format, type=BYTE_ARRAY"`
	// Content maps to the "content" column (BYTE_ARRAY).
	Content string `parquet:"name=content, type=BYTE_ARRAY"`
}
func main() { | |
var err error | |
fw, err := local.NewLocalFileWriter("figures.parquet") | |
if err != nil { | |
log.Fatal("Can't create local file", err) | |
} | |
// write | |
pw, err := writer.NewParquetWriter(fw, new(Figure), 4) | |
if err != nil { | |
log.Fatal("Can't create parquet writer", err) | |
} | |
pw.RowGroupSize = 128 * 1024 * 1024 // 128M | |
pw.PageSize = 8 * 1024 // 8K | |
pw.CompressionType = parquet.CompressionCodec_SNAPPY | |
num := 250 | |
for i := 0; i < num; i++ { | |
c, err := os.ReadFile(fmt.Sprintf("Figure %v.jpeg", num)) | |
if err != nil { | |
log.Fatal(err) | |
} | |
fig := Figure{ | |
FigureID: int32(i), | |
Format: "jpeg", | |
Content: string(c), | |
} | |
if err = pw.Write(fig); err != nil { | |
log.Fatal("Write error", err) | |
} | |
_ = c | |
} | |
if err = pw.WriteStop(); err != nil { | |
log.Fatal("WriteStop error", err) | |
} | |
log.Println("Write Finished") | |
fw.Close() | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import ollama | |
from chromadb.config import DEFAULT_TENANT, DEFAULT_DATABASE, Settings | |
from chromadb import PersistentClient | |
# Initialize ChromaDB client on the on-disk database stored under "db.chroma"
client = PersistentClient(
    path="db.chroma",
    settings=Settings(),
    tenant=DEFAULT_TENANT,
    database=DEFAULT_DATABASE,
)

# Define the prompt
prompt = "what are the sources of inertia?"

# Get the prompt's embedding from the Ollama service
response = ollama.embeddings(model="mxbai-embed-large", prompt=prompt)
embedding = response["embedding"]  # Already a list, do not need np.array

collection = client.get_collection("wardley_content_embeddings")

# Query ChromaDB for the single stored document closest to the prompt
results = collection.query(
    query_embeddings=[embedding],
    n_results=1,
)

# Only one query embedding was sent, so its matches are in the first entry.
# Fail with a readable message instead of an IndexError when nothing matched
# (for example when the collection is empty).
matches = results['documents'][0]
if not matches:
    raise SystemExit("No document found in the 'wardley_content_embeddings' collection.")
data = matches[0]

# Generate a response combining the prompt and the retrieved document
output = ollama.generate(
    model="llama3",
    prompt=f"Using this data: {data}. Respond to this prompt: {prompt}",
)
print(output['response'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment