Last active
June 5, 2024 01:47
-
-
Save anuragd-dataslush/9420e16a0d22232c229b2533521c833c to your computer and use it in GitHub Desktop.
Building Smart PDFs: OpenAI/Gemini, Langchain & pgvector (Node.js)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### docker-compose.yml | |
# Local development stack: a Postgres container (pgvector image) plus pgAdmin.
# Replace every {{...}} placeholder with a real value before running
# `docker compose up`.
version: "3"
services:
  # Database used by the application as its vector store.
  db:
    image: ankane/pgvector
    volumes:
      # Bind-mount ./data so the database files live on the host.
      - ./data:/var/lib/postgresql/data
    environment:
      POSTGRES_USER: {{user}}
      POSTGRES_PASSWORD: {{password}}
      POSTGRES_DB: {{db name}}
    ports:
      # Must match the port used in config/pgAndVectorConfig.js.
      - "5432:5432"
  # Web UI for inspecting the database.
  pgadmin:
    container_name: {{container name}}
    image: dpage/pgadmin4
    restart: always
    environment:
      PGADMIN_DEFAULT_EMAIL: {{Email}}
      PGADMIN_DEFAULT_PASSWORD: {{pg_admin_password}}
    ports:
      # Host port 8080 is mapped to the container's port 80.
      - "8080:80"
### config/pgAndVectorConfig.js | |
/**
 * Build the PGVectorStore configuration for a single vector table.
 *
 * Connection settings are read from environment variables at call time so
 * that no credentials are hard-coded in source:
 *   - POSTGRES_USER, POSTGRES_PASSWORD, POSTGRES_DB (same names as the `db`
 *     service environment in docker-compose.yml)
 *   - POSTGRES_HOST (optional, defaults to "localhost")
 *   - POSTGRES_PORT (optional, defaults to 5432, the port published by
 *     docker-compose.yml)
 *
 * @param {string} tableName - Name of the table that stores the vectors.
 * @returns {object} Config object with `postgresConnectionOptions`,
 *   `tableName` and `columns`.
 */
export const getVectorConfig = (tableName) => {
  const {
    POSTGRES_HOST,
    POSTGRES_PORT,
    POSTGRES_USER,
    POSTGRES_PASSWORD,
    POSTGRES_DB,
  } = process.env;

  // Fall back to the default port when the variable is unset or not numeric.
  const parsedPort = Number.parseInt(POSTGRES_PORT ?? "", 10);

  return {
    postgresConnectionOptions: {
      type: "postgres",
      host: POSTGRES_HOST ?? "localhost",
      port: Number.isNaN(parsedPort) ? 5432 : parsedPort,
      password: POSTGRES_PASSWORD,
      database: POSTGRES_DB,
      user: POSTGRES_USER,
    },
    tableName,
    columns: {
      idColumnName: "id",
      vectorColumnName: "vector",
      contentColumnName: "content",
      metadataColumnName: "metadata",
    },
  };
};
### SERVER.JS | |
import express from "express"; | |
import cors from "cors"; | |
import multer from "multer"; | |
import fs from "fs"; | |
import chatRoutes from "./routes/chat-route.js"; | |
import fileUploadRoutes from "./routes/fileUploader-route.js"; | |
import dotenv from "dotenv"; | |
dotenv.config(); | |
import bodyParser from "body-parser"; | |
// Port to listen on: the PORT environment variable when it is a positive
// integer, otherwise 5000.
const parsedPort = Number.parseInt(process.env.PORT ?? "", 10);
const port = parsedPort > 0 ? parsedPort : 5000;

// Disk storage for uploads: each file is written to
// ./uploads/<original name up to the first dot>/<original name>.
const multerStorage = multer.diskStorage({
  // Create (if needed) the folder where the uploaded file is stored.
  destination: (req, file, cb) => {
    const fileName = file?.originalname?.split(".")[0];
    const uploadDir = `./uploads/${fileName}`;
    try {
      fs.mkdirSync(uploadDir, { recursive: true });
    } catch (err) {
      // Report the failure through the callback instead of throwing from it.
      cb(err);
      return;
    }
    cb(null, uploadDir);
  },
  // Keep the original file name on disk.
  filename: (req, file, cb) => {
    cb(null, file.originalname);
  },
});

const upload = multer({ storage: multerStorage });
const server = express();

server.use(
  cors({
    origin: "*",
  })
);
server.use(bodyParser.json());
server.use("/chat", chatRoutes);
// Multipart field "file" is parsed by multer before the upload routes run.
server.use("/files", upload.array("file"), fileUploadRoutes);

server.listen(port, () => {
  console.log(`server is listening on port ${port}`);
});
### FILE UPLOAD FUNCTIONALITY | |
import fs from "fs"; | |
import chalk from "chalk"; | |
import { PDFLoader } from "langchain/document_loaders/fs/pdf"; | |
import { GoogleVertexAIEmbeddings } from "langchain/embeddings/googlevertexai"; | |
import { PGVectorStore } from "langchain/vectorstores/pgvector"; | |
import { CharacterTextSplitter } from "langchain/text_splitter"; | |
import { | |
getVectorConfig, | |
} from "../config/pgAndVectorConfig.js"; | |
import { updateUserFilesTable } from "../common/updateUserFilesTable.js"; | |
import { removeSymbolsFromName } from "../common/removeSymbolsFromName.js"; | |
/**
 * Express handler: ingest the first uploaded PDF into a pgvector table.
 *
 * Steps: validate the upload, load the PDF page by page, split it into
 * overlapping chunks, keep only the `loc` metadata of each chunk, embed the
 * chunks and store them in the table `<sanitised file name>_pdf`.
 *
 * Responses:
 *   - 400 (text) when no file was uploaded or the file is not a PDF
 *   - 200 (JSON) `{ status, message, fileName, fileType }` on success
 *   - 500 (JSON) `{ message, status: "error" }` when loading, embedding or
 *     storing fails
 *
 * @param {object} req - Express request; `req.files` is populated by multer.
 * @param {object} res - Express response.
 * @param {Function} next - Unused.
 */
export const fileUpload = async (req, res, next) => {
  const file = req.files?.[0];
  if (!file) {
    return res.status(400).send("No file was uploaded");
  }

  // Despite the name this is the MIME type; the name is kept because it is
  // what the response field `fileType` has always carried.
  const fileExtension = file.mimetype;
  if (fileExtension !== "application/pdf") {
    return res.status(400).send("Please upload a PDF file only");
  }

  const name = file.filename?.split(".")[0];
  const fileName = removeSymbolsFromName(name);

  let vectorStore;
  try {
    const loader = new PDFLoader(file.path, { splitPages: true });
    const docs = await loader.load();

    const splitter = new CharacterTextSplitter({
      separator: "",
      chunkSize: 1000, // maximum size of one chunk
      chunkOverlap: 100, // how much of the previous chunk is repeated in the next
    });
    const splitDocs = await splitter.splitDocuments(docs);

    // Store only the location info; it is what the chat endpoint returns as
    // the citation.
    const docsWithLocMetadata = splitDocs.map((doc) => ({
      ...doc,
      metadata: {
        loc: { ...doc.metadata.loc },
      },
    }));

    const embeddings = new GoogleVertexAIEmbeddings({ verbose: true });
    vectorStore = await PGVectorStore.fromDocuments(
      docsWithLocMetadata,
      embeddings,
      getVectorConfig(`${fileName}_pdf`)
    );

    return res.json({
      status: "success",
      message: "Successfully uploaded and parsed the PDF file",
      fileName,
      fileType: fileExtension,
    });
  } catch (err) {
    console.error(chalk.red("PDF ingestion error "), err);
    // An Error instance serialises to `{}`; send its message instead.
    return res
      .status(500)
      .json({ message: err?.message ?? String(err), status: "error" });
  } finally {
    // Release the connection pool opened for this request.
    await vectorStore?.end().catch((closeErr) => {
      console.error(chalk.red("pgVectorStore close error "), closeErr);
    });
  }
};
### PDF CHAT CONTROLLER | |
import fs from "fs"; | |
import chalk from "chalk"; | |
import { GoogleVertexAIEmbeddings } from "langchain/embeddings/googlevertexai"; | |
import { PGVectorStore } from "langchain/vectorstores/pgvector"; | |
import { | |
getVectorConfig, | |
} from "../config/pgAndVectorConfig.js"; | |
import { updateUserFilesTable } from "../common/updateUserFilesTable.js"; | |
import { removeSymbolsFromName } from "../common/removeSymbolsFromName.js"; | |
/**
 * Express handler: answer a question about a previously ingested document.
 *
 * Reads `req.body.message.{value, fileType, fileName}`, opens the vector
 * table named by `fileName`, runs a retrieval chain over it and returns the
 * answer together with the `loc` metadata of the two most similar chunks.
 *
 * Responses:
 *   - 400 `{ message }` when the question or the table name is missing
 *   - 200 `{ message: <answer>, citation: <loc[]> }` on success
 *   - 500 `{ message }` when retrieval or generation fails
 *
 * NOTE(review): SYSTEM_TEMPLATE, model, the prompt templates, the Runnable
 * classes, formatDocumentsAsString and StringOutputParser are not imported in
 * the visible part of this file — confirm they are in scope.
 *
 * @param {object} req - Express request with a JSON body.
 * @param {object} response - Express response.
 * @param {Function} next - Unused.
 */
export const postMessage = async (req, response, next) => {
  const {
    value: input,
    fileType,
    fileName: table_name,
  } = req.body?.message ?? {};

  if (!input || !table_name) {
    return response.status(400).send({
      message: "message.value and message.fileName are required",
    });
  }

  const isPdf = fileType === "pdf";
  console.log("info: tabl_name", table_name, fileType, isPdf);

  let pgVectorStore;
  try {
    // NOTE(review): table_name comes straight from the client and ends up as
    // a table identifier — presumably it should be validated; confirm.
    const vectorConfig = getVectorConfig(table_name);
    const embeddings = new GoogleVertexAIEmbeddings({ verbose: true });
    pgVectorStore = await PGVectorStore.initialize(embeddings, vectorConfig);

    const vectorStoreRetriever = pgVectorStore.asRetriever();
    const prompt = ChatPromptTemplate.fromMessages([
      SystemMessagePromptTemplate.fromTemplate(SYSTEM_TEMPLATE),
      HumanMessagePromptTemplate.fromTemplate("{question}"),
    ]);
    const chain = RunnableSequence.from([
      {
        context: vectorStoreRetriever.pipe(formatDocumentsAsString),
        question: new RunnablePassthrough(),
      },
      prompt,
      model,
      new StringOutputParser(),
    ]);

    // Separate search whose only purpose is to produce the citations.
    const pdfResult = await pgVectorStore.similaritySearch(input, 2);
    const metaData = pdfResult.map((data) => data.metadata?.loc);
    const answer = await chain.invoke(input);

    return response.send({
      message: answer,
      citation: metaData,
    });
  } catch (err) {
    console.error(chalk.red("postMessage error "), err);
    // An Error instance serialises to `{}`; send its message with a 500.
    return response.status(500).send({
      message: err?.message ?? String(err),
    });
  } finally {
    // Release the connection pool opened for this request.
    await pgVectorStore?.end().catch((closeErr) => {
      console.error(chalk.red("pgVectorStore close error "), closeErr);
    });
  }
};
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment