Skip to content

Instantly share code, notes, and snippets.

@densumesh
Last active May 15, 2024 23:07
Show Gist options
  • Save densumesh/31cc0ce3e54316a32dcddbdf48c2f11f to your computer and use it in GitHub Desktop.
Save densumesh/31cc0ce3e54316a32dcddbdf48c2f11f to your computer and use it in GitHub Desktop.
import * as fs from "fs";
import * as readline from "readline";
import * as path from "path";
import {
ChunkApi,
Configuration,
CreateChunkData,
} from "@devflowinc/trieve-js-ts-client";
import { it } from "node:test";
/**
 * A text value paired with a language tag. Used by `Item` for its
 * multi-valued text fields (brand, item_name, bullet_point, ...).
 */
interface LanguageTaggedValue {
// NOTE(review): presumably a locale identifier for `value` — confirm against the dataset.
language_tag: string;
// The text itself; this is the only member read by itemToSearchableString.
value: string;
}
/** A single measurement: a numeric value together with the name of its unit. */
interface ItemDimension {
unit: string;
value: number;
}
/** Height, length and width of an item, each carrying its own unit. */
interface ItemDimensions {
height: ItemDimension;
length: ItemDimension;
width: ItemDimension;
}
/**
 * An id/name pair attached to an item through `Item.node`.
 * NOTE(review): presumably a category ("browse") node — confirm against the dataset.
 */
interface Node {
node_id: number;
node_name: string;
}
/**
 * Shape assumed for one product listing, i.e. the object obtained by
 * JSON-parsing a single line of a listings file (see parseItemsFromFile).
 * Nothing validates parsed data against this interface at runtime.
 */
interface Item {
brand: LanguageTaggedValue[];
bullet_point: LanguageTaggedValue[];
color: LanguageTaggedValue[];
// Used as the chunk tracking id and as the last segment of the product link.
item_id: string;
// Optional: assigned by extractMetadata when the chunk metadata is built.
price?: number;
// Optional: assigned by extractMetadata when the chunk metadata is built.
image_url?: string;
item_name: LanguageTaggedValue[];
model_name: LanguageTaggedValue[];
model_number: { value: string }[];
model_year: { value: number }[];
product_type: { value: string }[];
style: LanguageTaggedValue[];
// Looked up in the image map to find the image path; an empty string is treated as "no image".
main_image_id: string;
other_image_id: string[];
// Each keyword value becomes one entry of the chunk's tag_set.
item_keywords: LanguageTaggedValue[];
country: string;
marketplace: string;
// Used as the host part of the product link.
domain_name: string;
node: Node[];
item_dimensions?: ItemDimensions;
}
/**
 * Flattens a product listing into a single block of text for indexing.
 *
 * Labelled fields ("Brand: ", "Color: ", ...) are each terminated by a newline.
 * Bullet points and keywords are emitted without a label and terminated by `;`,
 * so the field that follows them continues on the same line. For multi-valued
 * fields other than bullet points and keywords only the first entry is used.
 * Missing or empty values are skipped entirely.
 *
 * @param item - Listing to describe.
 * @param price - Optional price; when present it is emitted first as `Price: $<n>`.
 * @returns The concatenated text with leading/trailing whitespace removed.
 */
function itemToSearchableString(item: Item, price: number | undefined): string {
  const pieces: string[] = [];

  // Appends `label + text + terminator`, ignoring undefined/empty text.
  const append = (
    text: string | undefined,
    label: string = "",
    terminator: string = "\n",
  ): void => {
    if (!text) {
      return;
    }
    pieces.push(label + text + terminator);
  };

  append(price?.toString(), "Price: $");

  append(item.brand?.[0]?.value, "Brand: ");
  append(item.item_name?.[0]?.value, "Product Name: ");
  item.bullet_point?.forEach((bullet) => {
    append(bullet.value, "", ";");
  });
  append(item.color?.[0]?.value, "Color: ");
  append(item.model_name?.[0]?.value, "Model Name: ");
  append(item.model_number?.[0]?.value, "Model Number: ");

  // The year is numeric, so it is converted only once it is known to exist.
  const modelYear = item.model_year?.[0]?.value;
  if (modelYear !== undefined) {
    append(modelYear.toString(), "Model Year: ");
  }

  append(item.product_type?.[0]?.value, "Product Type: ");
  append(item.style?.[0]?.value, "Style: ");
  item.item_keywords?.forEach((keyword) => {
    append(keyword.value, "", ";");
  });
  append(item.country, "Country: ");
  append(item.marketplace, "Marketplace: ");
  append(item.domain_name, "Domain: ");

  return pieces.join("").trim();
}
// Trieve credentials come from the environment. A missing variable falls back
// to an empty string here instead of failing at startup.
const trieveApiKey = Bun.env.TRIEVE_API_KEY ?? "";
const trieveDatasetId = Bun.env.TRIEVE_DATASET_ID ?? "";
// Client configuration (API key + base URL) used to construct the Trieve API client below.
const trieveApiConfig = new Configuration({
apiKey: trieveApiKey,
basePath: "https://api.trieve.ai",
});
/**
 * Parses one JSON document and treats the result as an `Item`.
 *
 * No validation is performed: the return type is a compile-time assumption only.
 *
 * @param jsonString - JSON text describing a single listing.
 * @returns The parsed value, typed as `Item`.
 * @throws SyntaxError when `jsonString` is not valid JSON.
 */
function parseItem(jsonString: string): Item {
  return JSON.parse(jsonString);
}
/**
 * Builds the metadata object stored with a chunk: a shallow copy of the
 * listing with `image_url` and `price` set to the supplied values.
 *
 * Both keys are always present on the result, holding `undefined` when no
 * value was supplied. The input `item` is not modified.
 *
 * @param item - Listing whose own properties are copied.
 * @param image_url - Image URL, or `undefined` when the listing has no known
 *   image (the caller in this file passes `undefined` in that case).
 * @param price - Price to record, or `undefined`.
 * @returns The copied listing with `image_url` and `price` overridden.
 */
function extractMetadata(
  item: Item,
  image_url: string | undefined,
  price: number | undefined,
): any {
  // Typed local keeps the literal checked against Item even though the
  // declared return type stays `any` for existing callers.
  const metadata: Partial<Item> = { ...item, image_url, price };
  return metadata;
}
// Chunk API client built from trieveApiConfig; parseItemsFromFile uses it to upload batches.
const chunkApi = new ChunkApi(trieveApiConfig);
/**
 * Reads a listings file containing one JSON object per line, turns every line
 * into a chunk payload and uploads the payloads in batches of 50.
 *
 * For each listing the chunk carries the searchable text, a product link built
 * from the listing's domain and id, the listing id as tracking id (with
 * upsert enabled), the keyword values as tags, and the listing itself as
 * metadata. The image URL is resolved through the module-level `imageHashMap`.
 * The price is NOT read from the listing: it is picked at random from a fixed
 * list of values.
 *
 * Blank lines are skipped. A line that cannot be parsed or converted is logged
 * with its file and line number and then ignored. A failed batch upload is
 * logged and the remaining batches are still attempted.
 *
 * @param filePath - Path of the listings file to read.
 * @returns Every chunk payload built from the file, whether or not its upload succeeded.
 */
async function parseItemsFromFile(filePath: string) {
  const fileStream = fs.createReadStream(filePath);
  const rl = readline.createInterface({
    input: fileStream,
    crlfDelay: Infinity,
  });

  // Candidate prices; one is chosen at random per listing.
  const priceOptions = [10, 25, 50, 100, 500, 1000];

  const items: CreateChunkData = [];
  let lineNumber = 0;
  for await (const line of rl) {
    lineNumber += 1;
    // Blank lines carry no listing; skipping them avoids a spurious parse error.
    if (line.trim() === "") {
      continue;
    }
    try {
      const item: Item = JSON.parse(line);
      // An empty image id means "no image"; null never matches a key in the map.
      const imageId = item.main_image_id === "" ? null : item.main_image_id;
      const imagePath = imageHashMap.get(imageId);
      const image_url: string | undefined =
        imagePath != null
          ? `https://amazon-berkeley-objects.s3.amazonaws.com/images/small/${imagePath}`
          : undefined;
      const price =
        priceOptions[Math.floor(Math.random() * priceOptions.length)];
      items.push({
        chunk_html: itemToSearchableString(item, price),
        link: `https://${item.domain_name}/dp/${item.item_id}`,
        tracking_id: item.item_id,
        tag_set: item.item_keywords?.map((kw) => kw.value),
        metadata: extractMetadata(item, image_url, price),
        upsert_by_tracking_id: true,
      });
    } catch (error) {
      // Covers both invalid JSON and failures while building the chunk payload.
      console.error(
        `Error processing line ${lineNumber} of ${filePath}:`,
        error,
      );
    }
  }

  // Upload sequentially in fixed-size batches.
  const chunkSize = 50;
  for (let start = 0; start < items.length; start += chunkSize) {
    const batch = items.slice(start, start + chunkSize);
    try {
      console.log(`Creating chunk`);
      await chunkApi.createChunk(trieveDatasetId, batch);
    } catch (error) {
      console.error(`Failed to create chunk`);
      console.error(error);
    }
  }
  return items;
}
/**
 * Builds a lookup table from CSV text, mapping the first column of every data
 * row to its fourth column.
 *
 * The first line is treated as a header and skipped. Rows are split on plain
 * commas (quoted fields are not supported), and both the key and the value are
 * trimmed. Rows whose first or fourth column is missing or empty are ignored.
 * When a key occurs more than once, the last row wins.
 *
 * @param csvData - Complete CSV text, lines separated by `\n` (a trailing `\r` is trimmed).
 * @returns Map from first-column value to fourth-column value.
 */
function parseCSV(csvData: string): Map<any, any> {
  const lookup = new Map();
  // slice(1) drops the header row.
  for (const row of csvData.split("\n").slice(1)) {
    const columns = row.trim().split(",");
    const rawId = columns[0];
    const rawPath = columns[3];
    if (!rawId || !rawPath) {
      continue;
    }
    lookup.set(rawId.trim(), rawPath.trim());
  }
  return lookup;
}
// Read CSV file
// NOTE(review): absolute path on the author's machine — adjust before running elsewhere.
const csvFilePath = "/home/denssumesh/Documents/arguflow/amazon-abo/images.csv";
// Top-level await: the CSV text is fully loaded before the directory scan below starts.
const csvImageData = await Bun.file(csvFilePath).text();
// Lookup consulted by parseItemsFromFile to turn a listing's main_image_id into an image path.
let imageHashMap = parseCSV(csvImageData);
// parseCSV always returns a Map, so this branch is not expected to run.
if (imageHashMap == null) {
console.log("Failed");
}
// Example usage: process every entry of the listings directory. All files are
// started without waiting for one another; each reports its own result.
const directoryPath = "/home/denssumesh/Documents/arguflow/amazon-abo/listings";
fs.readdir(directoryPath, (err, files) => {
  if (err) {
    console.error("Error reading directory:", err);
    return;
  }

  // Runs one file to completion and logs either its item count or its failure.
  const processFile = async (file: string): Promise<void> => {
    try {
      const items = await parseItemsFromFile(path.join(directoryPath, file));
      console.log(`Processed ${file}: ${items.length} items`);
    } catch (error) {
      console.error(`Error processing ${file}:`, error);
    }
  };

  for (const file of files) {
    // Deliberately not awaited so that files are handled concurrently.
    void processFile(file);
  }
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment