// Scraped from a GitHub gist page (ornellaaltunyan/7d1103bfd77e7d65cb7c4ba70c871aa1), created November 14, 2024 23:11.
// The surrounding page chrome ("Skip to content", "Instantly share code, notes, and snippets.",
// "Show Gist options", "Save ... to your computer and use it in GitHub Desktop.") is not part of the script.
import fs from "fs";
import path from "path";
import dotenv from "dotenv";
import { marked } from "marked";
import { OpenAI } from "openai";
import { MongoClient } from "mongodb";
/** One heading-delimited chunk of a markdown document. */
interface Section {
  /** Text of the heading that opens the section. */
  title: string;
  /** Body text accumulated under the heading (paragraphs and fenced code). */
  content: string;
}
/** Shape of a document stored in the "documents" collection. */
interface DocType {
  /** String primary key; `main` builds it from the section title plus an index. */
  _id: string;
  /** Section heading text. */
  title: string;
  /** Section body text. */
  content: string;
  /** Embedding vector for the section, stored at the path the vector index reads. */
  embedding: number[];
}
// Load credentials from .env.local before anything below reads process.env.
dotenv.config({ path: ".env.local" });

// Fail fast with a clear message instead of a confusing auth/connection error later.
if (!process.env.BRAINTRUST_API_KEY) {
  throw new Error("BRAINTRUST_API_KEY is not set");
}
if (!process.env.MONGO_URI) {
  throw new Error("MONGO_URI is not set");
}

// OpenAI-compatible client authenticated with the Braintrust API key.
// The base URL must point at the Braintrust AI proxy: an empty string makes the
// SDK fall back to its default OpenAI endpoint, which rejects a Braintrust key.
// NOTE(review): URL restored from the Braintrust proxy docs — confirm it matches your deployment.
const openai = new OpenAI({
  baseURL: "https://api.braintrust.dev/v1/proxy",
  apiKey: process.env.BRAINTRUST_API_KEY,
});

// Shared MongoDB client used by both `run` and `main`.
const client = new MongoClient(process.env.MONGO_URI);
/**
 * Creates the Atlas Vector Search index on the "documents" collection of the
 * "braintrust-docs" database and polls until the index reports itself queryable.
 *
 * The shared `client` is always closed when this function finishes, whether it
 * succeeds or throws.
 */
async function run() {
  try {
    const database = client.db("braintrust-docs");
    const collection = database.collection("documents");

    // define your Atlas Vector Search index
    const index = {
      name: "vector_index",
      type: "vectorSearch",
      definition: {
        fields: [
          {
            type: "vector",
            // Must equal the length of the vectors stored under `embedding`.
            numDimensions: 1536,
            path: "embedding",
            similarity: "cosine",
          },
        ],
      },
    };

    // run the helper method
    const result = await collection.createSearchIndex(index);
    console.log(`New search index named ${result} is building.`);

    // wait for the index to be ready to query
    console.log("Polling to check if the index is ready. This may take up to a minute.")
    let isQueryable = false;
    while (!isQueryable) {
      const cursor = collection.listSearchIndexes();
      for await (const searchIndex of cursor) {
        if (searchIndex.name === result && searchIndex.queryable) {
          console.log(`${result} is ready for querying.`);
          isQueryable = true;
        }
      }
      if (!isQueryable) {
        // Sleep once per polling round, including rounds where the new index
        // is not listed yet, so the loop never spins without a delay.
        await new Promise(resolve => setTimeout(resolve, 5000));
      }
    }
  } finally {
    await client.close();
  }
}
/**
 * Splits a markdown file into heading-delimited sections.
 *
 * Each heading token starts a new section; paragraph, text and fenced-code
 * tokens that follow are appended to that section's content. Tokens that
 * appear before the first heading are ignored, as are other token types.
 *
 * A titled section with no content of its own absorbs the section that
 * follows it (that section's title becomes the first content line). Sections
 * whose content is still empty afterwards are dropped.
 *
 * @param filePath Path of the markdown file to read (UTF-8).
 * @returns The non-empty sections in document order.
 */
function parseMarkdownFile(filePath: string): Section[] {
  const content = fs.readFileSync(filePath, "utf-8");
  const tokens = marked.lexer(content);
  const sections: Section[] = [];
  let currentSection: Section | null = null;

  // A plain loop (rather than a forEach closure) lets TypeScript track the
  // assignments to `currentSection` for narrowing.
  for (const token of tokens) {
    if (token.type === "heading") {
      if (currentSection) {
        sections.push(currentSection);
      }
      currentSection = {
        title: token.text,
        content: "",
      };
    } else if (currentSection) {
      if (token.type === "paragraph" || token.type === "text") {
        currentSection.content += token.text + "\n";
      } else if (token.type === "code") {
        // Fences without a language tag have no `lang`; emit a bare fence
        // instead of the literal text "undefined".
        currentSection.content +=
          "```" + (token.lang ?? "") + "\n" + token.text + "\n```\n";
      }
    }
  }

  // Flush the section that was still open when the tokens ran out.
  if (currentSection) {
    sections.push(currentSection);
  }

  // Fold the following section into a heading that has no body of its own.
  for (let i = 0; i < sections.length - 1; i++) {
    const current = sections[i];
    const next = sections[i + 1];
    if (current.title !== "" && current.content === "") {
      current.content = next.title + "\n" + next.content;
      next.title = "";
      next.content = "";
    }
  }

  return sections.filter((section) => section.content !== "");
}
/**
 * Recursively collects the paths of all markdown files under a directory.
 *
 * @param dirPath Directory to scan.
 * @returns Paths (each prefixed with `dirPath`) of every regular file whose
 *   name ends in ".md" or ".mdx", in the order the file system lists them.
 */
function getAllMarkdownFiles(dirPath: string): string[] {
  const files: string[] = [];
  const entries = fs.readdirSync(dirPath, { withFileTypes: true });

  for (const entry of entries) {
    const fullPath = path.join(dirPath, entry.name);
    if (entry.isDirectory()) {
      // Descend into subdirectories and keep whatever they contain.
      files.push(...getAllMarkdownFiles(fullPath));
    } else if (
      entry.isFile() &&
      (entry.name.endsWith(".md") || entry.name.endsWith(".mdx"))
    ) {
      files.push(fullPath);
    }
  }

  return files;
}
/**
 * Extracts an HTTP status code from a thrown value, if it carries one.
 * Looks at `error.status` first and falls back to `error.response.status`.
 */
function getErrorStatus(error: unknown): number | undefined {
  if (typeof error !== "object" || error === null) {
    return undefined;
  }
  const { status, response } = error as {
    status?: unknown;
    response?: { status?: unknown } | null;
  };
  if (typeof status === "number") {
    return status;
  }
  const nestedStatus = response?.status;
  return typeof nestedStatus === "number" ? nestedStatus : undefined;
}

/**
 * Requests an embedding for `input` from the "text-embedding-3-small" model,
 * retrying when the request is rate limited (HTTP 429).
 *
 * @param input Text (or other input accepted by the embeddings endpoint) to embed.
 * @param retries Maximum number of attempts, including the first one.
 * @param delayMs Fixed pause between attempts, in milliseconds.
 * @returns The embeddings response from the API.
 * @throws The underlying error for any non-429 failure, or for a 429 on the
 *   final attempt; an `Error` if `retries` is less than 1.
 */
async function createEmbeddingWithRetry(
  input: Parameters<typeof openai.embeddings.create>[0]["input"],
  retries = 3,
  delayMs = 1000
) {
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      return await openai.embeddings.create({ input, model: "text-embedding-3-small" });
    } catch (error: unknown) {
      const hasAttemptsLeft = attempt < retries - 1;
      if (getErrorStatus(error) === 429 && hasAttemptsLeft) {
        console.log(`Rate limited. Retrying after ${delayMs}ms...`);
        await new Promise((resolve) => setTimeout(resolve, delayMs));
      } else {
        throw error;
      }
    }
  }
  // Only reachable when the loop body never ran; fail loudly rather than
  // silently resolving to undefined.
  throw new Error(`createEmbeddingWithRetry: retries must be at least 1 (got ${retries})`);
}
/**
 * Ingests the markdown files under "docs-sample" (next to this script) into
 * the "documents" collection of the "braintrust-docs" database.
 *
 * Every file is split into sections, each section is embedded, and the result
 * is upserted keyed by a string `_id`. Sections are processed in batches of 10;
 * each batch is awaited before the next one starts to throttle API requests.
 */
async function main() {
  await client.connect();
  const db = client.db("braintrust-docs");
  const collection = db.collection<DocType>("documents");

  const docsDir = path.join(__dirname, "docs-sample");
  const markdownFiles = getAllMarkdownFiles(docsDir);

  const allSections: Section[] = [];
  for (const file of markdownFiles) {
    const sections = parseMarkdownFile(file);
    allSections.push(...sections);
  }

  for (let i = 0; i < allSections.length; i += 10) {
    const batch = allSections.slice(i, i + 10);
    const batchPromises = batch.map(async (item, j) => {
      const embeddingResponse = await createEmbeddingWithRetry(
        `# ${item.title}\n\n${item.content}`
      );
      const embedding = embeddingResponse?.data?.[0]?.embedding;
      if (!embedding) {
        throw new Error(`No embedding returned for section "${item.title}"`);
      }
      return collection.updateOne(
        // `i` already advances by 10, so this suffix is unique per section but
        // not contiguous; the formula is kept as-is so re-runs keep hitting the
        // same document ids.
        { _id: `${item.title}-${i * 10 + j}` },
        {
          $set: {
            title: item.title,
            content: item.content,
            embedding: embedding,
          },
        },
        { upsert: true }
      );
    });
    // Throttle requests by awaiting each batch
    await Promise.all(batchPromises);
  }

  console.log(`Uploaded ${allSections.length} documents.`);
}

main()
  .catch((error: unknown) => {
    // Report the failure and signal it through the exit code.
    console.error(error);
    process.exitCode = 1;
  })
  .finally(() => client.close());
// Scraped GitHub page footer (not part of the script): "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment"