Skip to content

Instantly share code, notes, and snippets.

@yharaskrik
Created June 5, 2024 16:44
Show Gist options
  • Save yharaskrik/657ce8e25bc161013dc765edfa8a050b to your computer and use it in GitHub Desktop.
Save yharaskrik/657ce8e25bc161013dc765edfa8a050b to your computer and use it in GitHub Desktop.
import { TensorFlowEmbeddings } from '@langchain/community/embeddings/tensorflow';
import {
ChatPromptTemplate,
HumanMessagePromptTemplate,
SystemMessagePromptTemplate,
} from '@langchain/core/prompts';
import { ChatOpenAI } from '@langchain/openai';
import '@tensorflow/tfjs-node';
import { config } from 'dotenv';
import { MemoryVectorStore } from 'langchain/vectorstores/memory';
import { z } from 'zod';
// Run dotenv's config() before any client is constructed.
// NOTE(review): presumably this supplies the OpenAI credentials that ChatOpenAI reads
// from the environment — confirm against the deployment setup.
config();
// Plain-English matching rule. It is passed to the model twice in this script:
// once to extract the primary matching criteria, and once as the rule used to
// pick the duplicate among the candidates.
const matchingAlgorithm =
  'Two constituents are the same if they have the same address. ' +
  'If there are more than one unique constituent at the same address use the first and last names as tie breakers';
// Structured-output schema: the model must answer with the list of person
// fields that serve as primary matching criteria.
const matchingCriteriaSchema = z.object({
  primaryCriteria: z
    .array(z.enum(['firstName', 'lastName', 'email', 'address', 'phoneNumber']))
    .describe(
      `An array of primary matching criteria to use to match constituents.`,
    ),
});

// Prompt: a fixed system instruction followed by the free-text matching rule.
const matchingCriteriaPrompt = new ChatPromptTemplate({
  promptMessages: [
    SystemMessagePromptTemplate.fromTemplate(
      `You are an expert at parsing instructions by users that will define one or more primary and secondary
matching criteria to use to compare potential duplicate constituents.`,
    ),
    HumanMessagePromptTemplate.fromTemplate(`{input}`),
  ],
  inputVariables: ['input'],
});

// gpt-4o at temperature 0, constrained to reply in the shape of the schema.
const matchingCriteriaModel = new ChatOpenAI({
  modelName: 'gpt-4o',
  temperature: 0,
}).withStructuredOutput(matchingCriteriaSchema);

// Ask the model which fields the matching algorithm treats as primary criteria.
const matchingCriteria = await matchingCriteriaPrompt
  .pipe(matchingCriteriaModel)
  .invoke({ input: matchingAlgorithm });

console.log(JSON.stringify(matchingCriteria, null, 2));
// Address shared by both Connor records, so the address alone cannot tell them apart.
const connorAddress =
  '309 West Lincoln Avenue, Apartment 4, Los Angeles, California, USA';

// Sample constituent records that the user input is matched against.
const people = [
  {
    id: '257e18e5-c34d-41a0-94bd-bd02fdf20be2',
    firstName: 'Sarah',
    lastName: 'Connor',
    email: 'sarah.connor@domain.com',
    address: connorAddress,
    phoneNumber: '(408) 555-4822',
  },
  {
    id: '2a19be23-721d-4794-8b81-c49339051012',
    firstName: 'John',
    lastName: 'Connor',
    email: 'john.connor@domain.com',
    address: connorAddress,
    phoneNumber: '(408) 555-4822',
  },
  {
    id: '8070ac7f-693e-413b-b289-f2998a7f7697',
    firstName: 'Miles',
    lastName: 'Dyson',
    email: 'm.dyson@cyberdynesystems.com',
    address: '2144 Kramer Street, Los Angeles, California, USA',
    phoneNumber: '(555) 332 1754',
  },
];
// Define an in-memory vector store to query our vectors, using TensorFlow embeddings to calculate them.
const vectorStore = new MemoryVectorStore(new TensorFlowEmbeddings());

// One document per (person, primary criterion) pair, in person-major order.
// The "content" that gets embedded is the person's value for that criterion;
// the full person record rides along in the metadata so that a search hit can
// be traced back to the record it came from.
const criteriaDocuments = people.flatMap((person) =>
  matchingCriteria.primaryCriteria.map((primary) => ({
    pageContent: person[primary],
    metadata: { person },
  })),
);

// Index everything with one batched call instead of awaiting a separate call
// per document. Skipped when there is nothing to index, which mirrors the
// per-document loop never executing.
if (criteriaDocuments.length > 0) {
  await vectorStore.addDocuments(criteriaDocuments);
}
// Input provided by user that we cannot directly match to a record in the database.
// Its address is written with "Apt 4" while the matching entries in `people` use
// "Apartment 4", so an exact string comparison on address would not find them.
// `as const` keeps the keys and values as literal types.
const input = {
firstName: 'Sarah',
lastName: 'Connor',
address: '309 West Lincoln Avenue, Apt 4, Los Angeles, California, USA',
email: 'sarah.connor@domain.com',
phoneNumber: '(408) 555-4822',
} as const;
// Number of nearest vectors to fetch per matching criterion.
const topK = 2;

// For every primary matching criterion, look up the stored vectors closest to
// the input's value for that criterion. The searches run concurrently.
const searchResults = await Promise.all(
  matchingCriteria.primaryCriteria.map((criteria) =>
    vectorStore.similaritySearchWithScore(input[criteria], topK),
  ),
);

// The same person can be returned by more than one criterion search (every
// person is indexed once per criterion). Keep only the first hit per person ID
// so each candidate is offered to the model exactly once; first-seen order is
// preserved.
const seenPersonIds = new Set<string>();
const possibleDuplicates = searchResults
  .flat()
  // Each hit is a [document, score] pair; only the document is used from here on.
  .map(([doc]) => doc)
  .filter((doc) => {
    const personId: string = doc.metadata.person.id;
    if (seenPersonIds.has(personId)) {
      return false;
    }
    seenPersonIds.add(personId);
    return true;
  });

console.log(JSON.stringify(possibleDuplicates, null, 2));
// Structured-output schema for the verdict: the ID of the matched person, or null.
const matchResultSchema = z.object({
  matchingId: z
    .string()
    .nullable()
    .describe('ID of the matching best matched person, if there is one.'),
});

// Prompt: the system message carries the matching algorithm, the human message
// carries the input person and the list of candidate matches.
const matchPrompt = new ChatPromptTemplate({
  promptMessages: [
    SystemMessagePromptTemplate.fromTemplate(
      `You are an expert in Data Analysis and matching user entered information to information in a CRM to reduce duplicates.
You should match the input data to the potential duplicates using the following matching algorithm:
# Matching Algorithm
{matchingAlgorithm}`,
    ),
    HumanMessagePromptTemplate.fromTemplate(
      `
Does this person:
{input}
Match one of these people:
{possibleMatches}
`,
    ),
  ],
  inputVariables: ['input', 'matchingAlgorithm', 'possibleMatches'],
});

// gpt-4o at temperature 0, constrained to reply in the shape of the schema.
const matchModel = new ChatOpenAI({
  modelName: 'gpt-4o',
  temperature: 0,
}).withStructuredOutput(matchResultSchema);

// Runnable that formats the prompt and returns the structured verdict.
const chain = matchPrompt.pipe(matchModel);
// Render every candidate as an "Option" section: a heading with its position
// and ID, followed by the full person record as JSON.
const possibleMatchesText = possibleDuplicates
  .map(({ metadata }, position) => {
    const candidate = metadata.person;
    return `## Option ${position + 1} - ID ${candidate.id}\n${JSON.stringify(candidate)}`;
  })
  .join('\n');

// Ask the model whether the input matches one of the candidates.
const result = await chain.invoke({
  input: JSON.stringify(input),
  matchingAlgorithm,
  possibleMatches: possibleMatchesText,
});
console.log(JSON.stringify(result, null, 2));

// Resolve the returned ID back to the full record; logs `undefined` when
// no person has that ID (including when matchingId is null).
const matchedPerson = people.find(({ id }) => id === result.matchingId);
console.log(matchedPerson);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment