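// Ingest script: parse Markdown docs into sections, embed each section with
// OpenAI's text-embedding-3-small (via the Braintrust proxy), upsert the
// results into a MongoDB Atlas collection, and create a vector search index.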
import fs from "fs";
import path from "path";
import dotenv from "dotenv";
import { marked } from "marked";
import { OpenAI } from "openai";
import { MongoClient } from "mongodb";

interface Section {
  title: string;
  content: string;
}

interface DocType {
  _id: string;
  title: string;
  content: string;
  embedding: number[];
}

dotenv.config({ path: ".env.local" });

if (!process.env.BRAINTRUST_API_KEY) {
  throw new Error("BRAINTRUST_API_KEY is not set");
}
if (!process.env.MONGO_URI) {
  throw new Error("MONGO_URI is not set");
}

const openai = new OpenAI({
  baseURL: "https://api.braintrust.dev/v1/proxy",
  apiKey: process.env.BRAINTRUST_API_KEY,
});

const client = new MongoClient(process.env.MONGO_URI);
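// Create the Atlas Vector Search index on the `embedding` field and poll
// until it is queryable.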
async function run() {
  try {
    const database = client.db("braintrust-docs");
    const collection = database.collection("documents");

    // define your Atlas Vector Search index
    const index = {
      name: "vector_index",
      type: "vectorSearch",
      definition: {
        fields: [
          {
            type: "vector",
            numDimensions: 1536,
            path: "embedding",
            similarity: "cosine",
          },
        ],
      },
    };

    // run the helper method
    const result = await collection.createSearchIndex(index);
    console.log(`New search index named ${result} is building.`);

    // wait for the index to be ready to query
    console.log("Polling to check if the index is ready. This may take up to a minute.");
    let isQueryable = false;
    while (!isQueryable) {
      const cursor = collection.listSearchIndexes();
      for await (const index of cursor) {
        if (index.name === result) {
          if (index.queryable) {
            console.log(`${result} is ready for querying.`);
            isQueryable = true;
          } else {
            await new Promise((resolve) => setTimeout(resolve, 5000));
          }
        }
      }
    }
  } finally {
    await client.close();
  }
}
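// Split a Markdown file into sections keyed by heading. Paragraph, text, and
// code tokens accumulate under the most recent heading; heading-only sections
// are merged into the section that follows them.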
function parseMarkdownFile(filePath: string): Section[] {
  const content = fs.readFileSync(filePath, "utf-8");
  const tokens = marked.lexer(content);
  const sections: Section[] = [];
  let currentSection: Section | null = null;

  tokens.forEach((token) => {
    if (token.type === "heading") {
      if (currentSection) {
        sections.push(currentSection);
      }
      currentSection = {
        title: token.text,
        content: "",
      };
    } else if (currentSection) {
      if (token.type === "paragraph" || token.type === "text") {
        currentSection.content += token.text + "\n";
      } else if (token.type === "code") {
        // Fall back to an empty language tag when the code fence has none
        currentSection.content +=
          "```" + (token.lang ?? "") + "\n" + token.text + "\n```\n";
      }
    }
  });
  if (currentSection) {
    sections.push(currentSection);
  }

  // Merge heading-only sections into the following section
  for (let i = 0; i < sections.length - 1; i++) {
    if (sections[i].title !== "" && sections[i].content === "") {
      sections[i].content =
        sections[i + 1].title + "\n" + sections[i + 1].content;
      sections[i + 1].title = "";
      sections[i + 1].content = "";
    }
  }
  return sections.filter((section) => section.content !== "");
}
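// Recursively collect every .md and .mdx file under a directory.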
function getAllMarkdownFiles(dirPath: string): string[] {
  const files: string[] = [];
  const entries = fs.readdirSync(dirPath, { withFileTypes: true });
  for (const entry of entries) {
    const fullPath = path.join(dirPath, entry.name);
    if (entry.isDirectory()) {
      files.push(...getAllMarkdownFiles(fullPath));
    } else if (
      entry.isFile() &&
      (entry.name.endsWith(".md") || entry.name.endsWith(".mdx"))
    ) {
      files.push(fullPath);
    }
  }
  return files;
}
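// Request an embedding, retrying with a fixed delay when the embeddings
// endpoint responds with HTTP 429 (rate limit).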
async function createEmbeddingWithRetry(
  input: string,
  retries = 3,
  delayMs = 1000
) {
  for (let i = 0; i < retries; i++) {
    try {
      return await openai.embeddings.create({
        input,
        model: "text-embedding-3-small",
      });
    } catch (error: any) {
      // The OpenAI SDK exposes the HTTP status on `error.status`;
      // older SDK versions used `error.response.status`.
      const status = error?.status ?? error?.response?.status;
      if (status === 429 && i < retries - 1) {
        console.log(`Rate limited. Retrying after ${delayMs}ms...`);
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      } else {
        throw error;
      }
    }
  }
  throw new Error("Embedding request failed after retries");
}
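// Parse all docs, embed each section in batches of 10, upsert the results
// into MongoDB, then build the vector search index.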
async function main() {
  await client.connect();
  const db = client.db("braintrust-docs");
  const collection = db.collection<DocType>("documents");

  const docsDir = path.join(__dirname, "docs-sample");
  const markdownFiles = getAllMarkdownFiles(docsDir);

  const allSections: Section[] = [];
  for (const file of markdownFiles) {
    const sections = parseMarkdownFile(file);
    allSections.push(...sections);
  }

  // Embed and upsert sections in batches of 10
  for (let i = 0; i < allSections.length; i += 10) {
    const batch = allSections.slice(i, i + 10);
    const batchPromises = batch.map(async (item, j) => {
      const embeddingResponse = await createEmbeddingWithRetry(
        `# ${item.title}\n\n${item.content}`
      );
      const embedding = embeddingResponse.data[0].embedding;
      return collection.updateOne(
        // Key each document by its title plus its global section index
        { _id: `${item.title}-${i + j}` },
        {
          $set: {
            title: item.title,
            content: item.content,
            embedding: embedding,
          },
        },
        { upsert: true }
      );
    });
    // Throttle requests by awaiting each batch before starting the next
    await Promise.all(batchPromises);
  }
  console.log(`Uploaded ${allSections.length} documents.`);

  // Build the vector search index once the documents are uploaded
  await run();
}

main()
  .catch(console.error)
  .finally(() => client.close());
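// Usage note (assumption; the gist does not specify a runner): execute this
// file with a TypeScript runner such as ts-node or tsx, with
// BRAINTRUST_API_KEY and MONGO_URI defined in .env.local.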