Skip to content

Instantly share code, notes, and snippets.

@cdxker
Last active April 1, 2024 20:44
Show Gist options
  • Save cdxker/90af4ae4d4873cc2d6acc7586c3c9705 to your computer and use it in GitHub Desktop.
Save cdxker/90af4ae4d4873cc2d6acc7586c3c9705 to your computer and use it in GitHub Desktop.
Supa fast bulk_create script with https://trieve.ai and bun.js (unsupported)
API_URL="http://api.trieve.ai/api"
API_KEY="tr-***************"
ORGANIZATION_ID="************************************"
# Optional
DATASET_ID="*************" # If unset, the script creates a new dataset under ORGANIZATION_ID
# If the QDRANT_* variables are not set, Trieve's default Qdrant settings are used
QDRANT_URL="https://<my-qdrant-ip>:6334"
QDRANT_API_KEY="my-qdrant-api-key"
QDRANT_COLLECTION_NAME="my-collection"
/**
 * Base URL that every request path in this script is appended to, read from
 * the `API_URL` environment variable.
 *
 * Fails fast when the variable is missing or empty: without this check the
 * template strings further down would silently build URLs such as
 * "undefined/chunk".
 */
const API_URL: string = (() => {
  const url = Bun.env.API_URL;
  if (!url) {
    throw new Error("API_URL environment variable must be set");
  }
  return url;
})();

/**
 * Optional Qdrant settings forwarded in the `server_configuration` of a newly
 * created dataset. Each one is `null` when its environment variable is unset.
 */
const QDRANT_URL = Bun.env.QDRANT_URL ?? null;
const QDRANT_API_KEY = Bun.env.QDRANT_API_KEY ?? null;
const QDRANT_COLLECTION_NAME = Bun.env.QDRANT_COLLECTION_NAME ?? null;
/**
 * One element of the JSON array that `createChunk` POSTs to
 * `${API_URL}/chunk`.
 *
 * NOTE(review): the meaning of each field is defined by the Trieve API rather
 * than by this file — confirm against the API reference before relying on the
 * short descriptions below.
 */
export interface CreateChunkData {
  // --- required fields ---

  /** Chunk content; presumably HTML, judging by the name. */
  chunk_html: string;
  /** Link associated with the chunk (this script always sends ""). */
  link: string;
  /** Arbitrary JSON-serializable metadata object attached to the chunk. */
  metadata: object;

  // --- optional fields ---

  /** Presumably IDs of groups the chunk belongs to — TODO confirm. */
  group_ids?: Array<string>;
  /** Presumably free-form tags for the chunk — TODO confirm. */
  tag_set?: Array<string>;
  /** Caller-chosen identifier for the chunk. */
  tracking_id?: string;
  /** Presumably switches create to upsert keyed on `tracking_id` — TODO confirm. */
  upsert_by_tracking_id?: boolean;
}
/**
 * Uploads one batch of chunks with a single POST to `${API_URL}/chunk` and
 * logs how long the request took.
 *
 * The request is authorized with the `API_KEY` environment variable and is
 * addressed to the dataset in `DATASET_ID` via the `TR-Dataset` header.
 *
 * @param chunkData - Chunks to upload; serialized as a JSON array in the body.
 * @returns `""` when the server answers with a non-2xx status (status line and
 *   response body are written to stderr), `undefined` after a successful
 *   upload. Rejects if `fetch` itself or JSON decoding of the response throws.
 */
const createChunk = async (
  chunkData: CreateChunkData[],
): Promise<string | undefined> => {
  // Time each call on its own. The script issues several uploads concurrently,
  // and a shared console.time label would collide between them.
  const startedAt = performance.now();
  try {
    const response = await fetch(`${API_URL}/chunk`, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: Bun.env.API_KEY ?? "",
        "TR-Dataset": DATASET_ID,
      },
      body: JSON.stringify(chunkData),
    });

    if (!response.ok) {
      console.error("err", response.status, response.statusText);
      // Read the body as text: an error response is not guaranteed to be JSON.
      console.error("err", await response.text());
      return "";
    }

    const responseJson = await response.json();
    console.log("succ", responseJson.chunk_metadata);
    return undefined;
  } finally {
    // Report the elapsed time on every path, including failures.
    console.log(`upload: ${(performance.now() - startedAt).toFixed(2)}ms`);
  }
};
/**
 * Creates a dataset named "TestDataset" with a POST to `${API_URL}/dataset`
 * and returns its ID.
 *
 * The organization comes from the `ORGANIZATION_ID` environment variable and
 * the request is authorized with `API_KEY`. The Qdrant settings are passed
 * through as-is and are `null` when not configured.
 *
 * @returns The `id` field of the JSON response.
 * @throws Error when the server answers with a non-2xx status or the response
 *   carries no string `id`; continuing without a dataset ID would make every
 *   later upload fail.
 */
const createDataset = async (): Promise<string> => {
  const organizationId = Bun.env.ORGANIZATION_ID ?? "";

  const response = await fetch(`${API_URL}/dataset`, {
    headers: {
      "Content-Type": "application/json",
      "TR-Organization": organizationId,
      Authorization: Bun.env.API_KEY ?? "",
    },
    body: JSON.stringify({
      dataset_name: "TestDataset",
      organization_id: organizationId,
      server_configuration: {
        LLM_BASE_URL: "",
        LLM_DEFAULT_MODEL: "",
        RAG_PROMPT: "",
        EMBEDDING_SIZE: 1024,
        N_RETRIEVALS_TO_INCLUDE: 8,
        DUPLICATE_DISTANCE_THRESHOLD: 1.1,
        DOCUMENT_UPLOAD_FEATURE: true,
        DOCUMENT_DOWNLOAD_FEATURE: true,
        COLLISIONS_ENABLED: false,
        QDRANT_URL,
        QDRANT_API_KEY,
        QDRANT_COLLECTION_NAME,
      },
      client_configuration: "{}",
    }),
    method: "POST",
  });
  console.log(response);

  if (!response.ok) {
    // Read the body as text: an error response is not guaranteed to be JSON.
    const detail = await response.text();
    throw new Error(
      `Failed to create dataset: ${response.status} ${response.statusText} ${detail}`,
    );
  }

  const responseJson: unknown = await response.json();
  const id =
    typeof responseJson === "object" && responseJson !== null
      ? (responseJson as { id?: unknown }).id
      : undefined;
  if (typeof id !== "string") {
    throw new Error("Dataset creation response did not contain a string `id`");
  }
  return id;
};
// Use the dataset named in the environment, or create a fresh one when
// DATASET_ID is not set.
const DATASET_ID = Bun.env.DATASET_ID ?? (await createDataset());

/**
 * Builds the 1000 synthetic chunks of batch `j`. Every chunk gets the tracking
 * ID `${i}${j}`, where `i` is its index within the batch.
 */
const buildBatch = (j: number): CreateChunkData[] =>
  Array.from({ length: 1000 }, (_, i) => ({
    chunk_html: `ID ${i} ${j} Seems they’ve built out a PG version of MySQL’s Vitess Query rewriting seems interesting, having a layer between your DB and your application would also allow various ACL stuff as wellr`,
    link: "",
    tracking_id: `${i}${j}`,
    metadata: {
      bro: "hi",
    },
  }));

// Start all 10 batch uploads without awaiting in between, then wait for every
// one of them to finish.
const uploads = Array.from({ length: 10 }, (_, j) => createChunk(buildBatch(j)));
await Promise.all(uploads);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment