# Building Smart PDFs: OpenAI/Gemini, Langchain & pgvector (Node.js)
### docker-compose.yml
version: "3"
services:
  db:
    image: ankane/pgvector
    volumes:
      - ./data:/var/lib/postgresql/data
    environment:
      POSTGRES_USER: {{user}}
      POSTGRES_PASSWORD: {{password}}
      POSTGRES_DB: {{db name}}
    ports:
      - "5432:5432"
  pgadmin:
    container_name: {{container name}}
    image: dpage/pgadmin4
    restart: always
    environment:
      PGADMIN_DEFAULT_EMAIL: {{Email}}
      PGADMIN_DEFAULT_PASSWORD: {{pg_admin_password}}
    ports:
      - "8080:80"
### config/pgAndVectorConfig.js
export const getVectorConfig = (tableName) => {
  const vectorConfig = {
    postgresConnectionOptions: {
      type: "postgres",
      host: "localhost",
      port: 5432, // same port as published in docker-compose.yml
      password: {{password}},
      database: {{database}},
      user: {{user}},
    },
    tableName,
    columns: {
      idColumnName: "id",
      vectorColumnName: "vector",
      contentColumnName: "content",
      metadataColumnName: "metadata",
    },
  };
  return vectorConfig;
};
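The `{{…}}` placeholders have to be replaced with the same credentials used in docker-compose.yml. A minimal env-driven variant is sketched below, assuming variable names such as `POSTGRES_USER` / `POSTGRES_PASSWORD` / `POSTGRES_DB` that are not part of the original gist (dotenv is already loaded in server.js):

```js
// config/pgAndVectorConfig.js -- env-driven sketch; the variable names are assumptions
import dotenv from "dotenv";
dotenv.config();

export const getVectorConfig = (tableName) => ({
  postgresConnectionOptions: {
    type: "postgres",
    host: process.env.POSTGRES_HOST ?? "localhost",
    port: Number(process.env.POSTGRES_PORT ?? 5432),
    user: process.env.POSTGRES_USER,
    password: process.env.POSTGRES_PASSWORD,
    database: process.env.POSTGRES_DB,
  },
  tableName,
  columns: {
    idColumnName: "id",
    vectorColumnName: "vector",
    contentColumnName: "content",
    metadataColumnName: "metadata",
  },
});
```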
### server.js
import express from "express";
import cors from "cors";
import multer from "multer";
import fs from "fs";
import dotenv from "dotenv";
import bodyParser from "body-parser";
import chatRoutes from "./routes/chat-route.js";
import fileUploadRoutes from "./routes/fileUploader-route.js";

dotenv.config();

const multerStorage = multer.diskStorage({
  // create a folder per upload where the file is stored
  destination: (req, file, cb) => {
    const fileName = file?.originalname?.split(".")[0];
    const path = "./uploads/" + fileName;
    fs.mkdirSync(path, { recursive: true });
    cb(null, path);
  },
  // keep the original file name
  filename: (req, file, cb) => {
    cb(null, file.originalname);
  },
});
const upload = multer({ storage: multerStorage });

const server = express();
server.use(
  cors({
    origin: "*",
  })
);
server.use(bodyParser.json());
server.use("/chat", chatRoutes);
server.use("/files", upload.array("file"), fileUploadRoutes);

server.listen(5000, () => {
  console.log("server is listening on port 5000");
});
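With the server running, the two routes can be exercised from any HTTP client. A minimal sketch using Node 18+ global `fetch`/`FormData`/`Blob` is shown below; the exact sub-paths inside the two routers are not shown in the gist, so `/` is assumed for both:

```js
// client-example.js -- usage sketch; router sub-paths and the sample PDF path are assumptions
import fs from "fs";

// 1) upload a PDF; the field name "file" matches upload.array("file") in server.js
const form = new FormData();
form.append(
  "file",
  new Blob([fs.readFileSync("./sample.pdf")], { type: "application/pdf" }),
  "sample.pdf"
);
const uploadRes = await fetch("http://localhost:5000/files", { method: "POST", body: form });
const { fileName } = await uploadRes.json();

// 2) ask a question; the body shape mirrors what the chat controller reads, and the
//    table name is the sanitized file name plus the "_pdf" suffix added at upload time
const chatRes = await fetch("http://localhost:5000/chat", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    message: {
      value: "What is this document about?",
      fileType: "pdf",
      fileName: `${fileName}_pdf`,
    },
  }),
});
console.log(await chatRes.json());
```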
### FILE UPLOAD FUNCTIONALITY
import fs from "fs";
import chalk from "chalk";
import { PDFLoader } from "langchain/document_loaders/fs/pdf";
import { GoogleVertexAIEmbeddings } from "langchain/embeddings/googlevertexai";
import { PGVectorStore } from "langchain/vectorstores/pgvector";
import { CharacterTextSplitter } from "langchain/text_splitter";
import { getVectorConfig } from "../config/pgAndVectorConfig.js";
import { updateUserFilesTable } from "../common/updateUserFilesTable.js";
import { removeSymbolsFromName } from "../common/removeSymbolsFromName.js";

export const fileUpload = async (req, res, next) => {
  const file = req.files[0];
  const filePath = file.path;
  const name = file.filename?.split(".")[0];
  const fileName = removeSymbolsFromName(name);
  const fileExtension = file.mimetype;

  // only PDFs are supported
  if (fileExtension !== "application/pdf") {
    return res.status(400).send("Please upload a PDF file only");
  }
  console.log(chalk.red("APPLICATION/PDF"));

  // load the PDF page by page
  const loader = new PDFLoader(filePath, {
    splitPages: true,
  });
  const docs = await loader.load();

  // split the pages into overlapping character chunks
  const splitter = new CharacterTextSplitter({
    separator: "",
    chunkSize: 1000, // maximum characters per chunk
    chunkOverlap: 100, // characters carried over from the previous chunk
  });
  const splitDocs = await splitter.splitDocuments(docs);

  // keep only the location info (page/line numbers) as metadata for citations
  const docsWithCustomMetaData = splitDocs.map((doc) => ({
    ...doc,
    metadata: {
      loc: {
        ...doc.metadata.loc,
      },
    },
  }));

  const vectorConfig = getVectorConfig(fileName + "_pdf");
  try {
    const embeddings = new GoogleVertexAIEmbeddings({
      verbose: true,
    });
    // embed the chunks and store them in the pgvector table
    await PGVectorStore.fromDocuments(docsWithCustomMetaData, embeddings, vectorConfig);
    return res.json({
      status: "success",
      message: "Successfully uploaded and parsed the PDF file",
      fileName: fileName,
      fileType: fileExtension,
    });
  } catch (err) {
    console.error(chalk.red("pgVectorStore connection error "), err);
    return res.status(500).json({ message: err, status: "error" });
  }
};
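`removeSymbolsFromName` is imported from `../common/removeSymbolsFromName.js` but not included in the gist. Since its output becomes part of a Postgres table name, a plausible sketch is shown below; the exact implementation is an assumption:

```js
// common/removeSymbolsFromName.js -- hypothetical implementation, not from the original gist
// strips characters that are not safe in a Postgres identifier
export const removeSymbolsFromName = (name = "") =>
  name
    .toLowerCase()
    .replace(/[^a-z0-9_]/g, "_") // replace anything that is not a letter, digit or underscore
    .replace(/^(\d)/, "_$1"); // identifiers should not start with a digit
```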
### PDF CHAT CONTROLLER
import fs from "fs";
import chalk from "chalk";
import { GoogleVertexAIEmbeddings } from "langchain/embeddings/googlevertexai";
import { PGVectorStore } from "langchain/vectorstores/pgvector";
// prompt and runnable helpers (same legacy "langchain/*" import style as the rest of the gist)
import {
  ChatPromptTemplate,
  SystemMessagePromptTemplate,
  HumanMessagePromptTemplate,
} from "langchain/prompts";
import { RunnableSequence, RunnablePassthrough } from "langchain/schema/runnable";
import { StringOutputParser } from "langchain/schema/output_parser";
import { formatDocumentsAsString } from "langchain/util/document";
import { getVectorConfig } from "../config/pgAndVectorConfig.js";
import { updateUserFilesTable } from "../common/updateUserFilesTable.js";
import { removeSymbolsFromName } from "../common/removeSymbolsFromName.js";
// SYSTEM_TEMPLATE and model are not defined in the gist; see the sketch after this section

export const postMessage = async (req, response, next) => {
  const input = req.body.message.value;
  const fileType = req.body.message.fileType;
  const table_name = req.body.message.fileName;
  const isPdf = fileType === "pdf";
  console.log("info: table_name", table_name, fileType, isPdf);

  try {
    const vectorConfig = getVectorConfig(table_name);
    const embeddings = new GoogleVertexAIEmbeddings({
      verbose: true,
    });
    // connect to the existing pgvector table for this file
    const pgVectorStore = await PGVectorStore.initialize(embeddings, vectorConfig);
    const vectorStoreRetriever = pgVectorStore.asRetriever();

    // build the RAG prompt: system instructions plus the user question
    const messages = [
      SystemMessagePromptTemplate.fromTemplate(SYSTEM_TEMPLATE),
      HumanMessagePromptTemplate.fromTemplate("{question}"),
    ];
    const prompt = ChatPromptTemplate.fromMessages(messages);

    // retrieve relevant chunks, stuff them into the prompt, call the model, parse to string
    const chain = RunnableSequence.from([
      {
        context: vectorStoreRetriever.pipe(formatDocumentsAsString),
        question: new RunnablePassthrough(),
      },
      prompt,
      model,
      new StringOutputParser(),
    ]);

    // fetch the two most similar chunks separately so their locations can be returned as citations
    const pdfResult = await pgVectorStore.similaritySearch(input, 2);
    const metaData = pdfResult.map((data) => data.metadata.loc);

    const answer = await chain.invoke(input);
    return response.send({
      message: answer,
      citation: metaData,
    });
  } catch (err) {
    return response.status(500).send({
      message: err,
    });
  }
};
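The controller above references `SYSTEM_TEMPLATE` and `model`, neither of which appears in the gist. A minimal sketch is given below, assuming a Vertex AI chat model to pair with `GoogleVertexAIEmbeddings`; an OpenAI chat model would slot into the same position in the chain:

```js
// assumptions: the prompt wording and model choice are not from the original gist
import { ChatGoogleVertexAI } from "langchain/chat_models/googlevertexai";

// instruct the model to answer only from the retrieved PDF context
const SYSTEM_TEMPLATE = `Answer the user's question using only the context below.
If the answer is not in the context, say you don't know.
----------------
{context}`;

const model = new ChatGoogleVertexAI({
  temperature: 0,
});
```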