josephrocca/e5-large-v2.js

## e5-large-v2.js
// See the comments at the end for a model that does much better than e5-large-v2 while being a third of the size.

// Load transformers.js (pinned to v2.7.0) from the jsDelivr CDN via a dynamic import.
// This relies on top-level `await`, so run it as an ES module or in a console that supports it.
let { pipeline } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.7.0');
// Build a feature-extraction pipeline backed by the `Xenova/e5-large-v2` model.
// NOTE(review): the first call presumably downloads the model files, so it may be slow - confirm.
let extractor = await pipeline('feature-extraction', 'Xenova/e5-large-v2');

// Note: If you're just comparing "passages" with one another, then just prepend "passage: " to all texts.
// Only use "query: " if the text is a short "search query", like the `query` example below.
// Every text is embedded with the same options (`pooling: 'mean'`, `normalize: true`) so the results are comparable.
// NOTE(review): with `normalize: true` the embeddings should be unit-length, which would make the
// dot product used later equal to cosine similarity - confirm against the transformers.js docs.
let passage1 = await extractor(`passage: The Shawshank Redemption is a true masterpiece of cinema.`, { pooling: 'mean', normalize: true });
let passage2 = await extractor(`passage: The film should not be exposed to sunlight when removing it from the wrapper. Otherwise your movie will come out bad.`, { pooling: 'mean', normalize: true });
let query = await extractor(`query: movie review`, { pooling: 'mean', normalize: true });

// now use dot product to compare query with passages:

/**
 * Computes the dot product of two equal-length numeric vectors.
 *
 * Accepts any indexable numeric sequence that exposes `length`
 * (plain arrays or typed arrays).
 *
 * @param {ArrayLike<number>} vec1 - First vector.
 * @param {ArrayLike<number>} vec2 - Second vector; must be as long as `vec1`.
 * @returns {number} Sum of the element-wise products (0 for two empty vectors).
 * @throws {RangeError} If the two vectors differ in length.
 */
function dotProduct(vec1, vec2) {
  // Without this guard a mismatch yields NaN (vec2 shorter) or a silently
  // truncated sum (vec2 longer) instead of an obvious failure.
  if (vec1.length !== vec2.length) {
    throw new RangeError(`dotProduct: vector lengths differ (${vec1.length} vs ${vec2.length})`);
  }
  let result = 0;
  for (let i = 0; i < vec1.length; i++) {
    result += vec1[i] * vec2[i];
  }
  return result;
}

// Log the dot product of the query embedding with each passage embedding.
// NOTE(review): `.data` is presumably the flat numeric array holding the embedding values - confirm.
console.log(dotProduct(query.data, passage1.data));
console.log(dotProduct(query.data, passage2.data));
// Reminder printed alongside the scores: e5-large-v2 barely separates these two passages.
console.log("yep, these numbers are very close - use bge-base-en-v1.5 for much better accuracy");

// Note: The `bge-base-en-v1.5` model has been released and seems to be better than `e5-large-v2` while being significantly smaller - use it like this:

// let extractor = await pipeline('feature-extraction', 'Xenova/bge-base-en-v1.5');
// let passage1 = await extractor('The Shawshank Redemption is a true masterpiece of cinema.', { pooling: 'mean', normalize: true });
// let passage2 = await extractor(`The film should not be exposed to sunlight when removing it from the wrapper. Otherwise your movie will come out bad.`, { pooling: 'mean', normalize: true });
// let query = await extractor('Represent this sentence for searching relevant passages: film review', { pooling: 'mean', normalize: true });

// Note that the *exact* text "Represent this sentence for searching relevant passages:" must be prepended for "queries" (as opposed to "passages"), since that's how the model was trained. That said, the authors say that v1.5 has reduced the need for the instruction in some cases.
// See readme for more details: https://huggingface.co/BAAI/bge-base-en-v1.5

// Also, keep an eye on this leaderboard to find new, better models: https://huggingface.co/spaces/mteb/leaderboard
	// See the comments at the end for a model that does much better than e5-large-v2 while being a third of the size.

	// Load transformers.js (pinned to v2.7.0) from the jsDelivr CDN via a dynamic import.
	// This relies on top-level `await`, so run it as an ES module or in a console that supports it.
	let { pipeline } = await import('https://cdn.jsdelivr.net/npm/@xenova/transformers@2.7.0');
	// Build a feature-extraction pipeline backed by the `Xenova/e5-large-v2` model.
	// NOTE(review): the first call presumably downloads the model files, so it may be slow - confirm.
	let extractor = await pipeline('feature-extraction', 'Xenova/e5-large-v2');

	// Note: If you're just comparing "passages" with one another, then just prepend "passage: " to all texts.
	// Only use "query: " if the text is a short "search query", like the `query` example below.
	// Every text is embedded with the same options (`pooling: 'mean'`, `normalize: true`) so the results are comparable.
	// NOTE(review): with `normalize: true` the embeddings should be unit-length, which would make the
	// dot product used later equal to cosine similarity - confirm against the transformers.js docs.
	let passage1 = await extractor(`passage: The Shawshank Redemption is a true masterpiece of cinema.`, { pooling: 'mean', normalize: true });
	let passage2 = await extractor(`passage: The film should not be exposed to sunlight when removing it from the wrapper. Otherwise your movie will come out bad.`, { pooling: 'mean', normalize: true });
	let query = await extractor(`query: movie review`, { pooling: 'mean', normalize: true });

	// now use dot product to compare query with passages:

	/**
	 * Computes the dot product of two equal-length numeric vectors.
	 *
	 * Accepts any indexable numeric sequence that exposes `length`
	 * (plain arrays or typed arrays).
	 *
	 * @param {ArrayLike<number>} vec1 - First vector.
	 * @param {ArrayLike<number>} vec2 - Second vector; must be as long as `vec1`.
	 * @returns {number} Sum of the element-wise products (0 for two empty vectors).
	 * @throws {RangeError} If the two vectors differ in length.
	 */
	function dotProduct(vec1, vec2) {
		// Without this guard a mismatch yields NaN (vec2 shorter) or a silently
		// truncated sum (vec2 longer) instead of an obvious failure.
		if (vec1.length !== vec2.length) {
			throw new RangeError(`dotProduct: vector lengths differ (${vec1.length} vs ${vec2.length})`);
		}
		let result = 0;
		for (let i = 0; i < vec1.length; i++) {
			result += vec1[i] * vec2[i];
		}
		return result;
	}

	// Log the dot product of the query embedding with each passage embedding.
	// NOTE(review): `.data` is presumably the flat numeric array holding the embedding values - confirm.
	console.log(dotProduct(query.data, passage1.data));
	console.log(dotProduct(query.data, passage2.data));
	// Reminder printed alongside the scores: e5-large-v2 barely separates these two passages.
	console.log("yep, these numbers are very close - use bge-base-en-v1.5 for much better accuracy");

	// Note: The `bge-base-en-v1.5` model has been released and seems to be better than `e5-large-v2` while being significantly smaller - use it like this:

	// let extractor = await pipeline('feature-extraction', 'Xenova/bge-base-en-v1.5');
	// let passage1 = await extractor('The Shawshank Redemption is a true masterpiece of cinema.', { pooling: 'mean', normalize: true });
	// let passage2 = await extractor(`The film should not be exposed to sunlight when removing it from the wrapper. Otherwise your movie will come out bad.`, { pooling: 'mean', normalize: true });
	// let query = await extractor('Represent this sentence for searching relevant passages: film review', { pooling: 'mean', normalize: true });

	// Note that the exact text "Represent this sentence for searching relevant passages:" must be prepended for "queries" (as opposed to "passages"), since that's how the model was trained. That said, the authors say that v1.5 has reduced the need for the instruction in some cases.
	// See readme for more details: https://huggingface.co/BAAI/bge-base-en-v1.5

	// Also, keep an eye on this leaderboard to find new, better models: https://huggingface.co/spaces/mteb/leaderboard