Skip to content

Instantly share code, notes, and snippets.

@ieb
Last active April 17, 2023 14:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ieb/56051b6cdf1cfe377927219bdf6e1f22 to your computer and use it in GitHub Desktop.
Save ieb/56051b6cdf1cfe377927219bdf6e1f22 to your computer and use it in GitHub Desktop.
Is in-browser cosine similarity using a TensorFlow model viable?
<html>
<head>
</head>
<body>
This is not doing a real search. Look at the JavaScript. It generates a dataset standing in for a preprocessed set of tensors
representing pages or sentences. This happens during page load, as if the dataset had been downloaded from a server. Then, when the button below
is pressed, a tensor representing the search sentence is randomly generated and its cosine similarity (the dot product normalised by the Euclidean lengths) with every dataset tensor is calculated.
In the JavaScript console you should get an idea of how many matches > 0.5 were found and how long it took, as it will typically
take longer than Chrome likes an event handler to take. On my ancient laptop, I see about 200ms for 10K 1024-length vectors.
This shows the approach is potentially viable from a compute point of view. However, the dataset for the pages is 40MB uncompressed, which is not viable,
and with 1024-length vectors from an untrained small language model, the results will probably work for general language, but not for domain-specific vocabulary.
<button id="searchbutton" >Run Search</button>
<script type="text/javascript">
// shamelessly copied from https://stackoverflow.com/questions/51362252/javascript-cosine-similarity-function
/**
 * Dot product of two numeric vectors.
 *
 * Iteration is driven by the length of `x`; `y` is expected to be at least
 * as long. A missing `y[i]` evaluates to `undefined`, which makes the
 * result NaN.
 *
 * @param {number[]} x - First vector; its length drives the iteration.
 * @param {number[]} y - Second vector.
 * @returns {number} Sum of x[i] * y[i] over the indices of x, or 0 when x is empty.
 */
function dotp(x, y) {
  // Accumulate directly instead of building a temporary array of products
  // with map(): this function is called for every vector comparison.
  let sum = 0;
  for (let i = 0; i < x.length; i++) {
    sum += x[i] * y[i];
  }
  return sum;
}
/**
 * Cosine similarity of two vectors: dot(A, B) / (|A| * |B|).
 *
 * @param {number[]} A - First vector.
 * @param {number[]} B - Second vector, same length as A.
 * @returns {number} The similarity, or 0 when either vector has zero
 *   magnitude (including empty vectors).
 */
function cosineSimilarity(A, B) {
  const magnitude = Math.sqrt(dotp(A, A)) * Math.sqrt(dotp(B, B));
  // A zero-magnitude vector has no direction; report "no similarity"
  // instead of letting 0 / 0 produce NaN.
  if (magnitude === 0) {
    return 0;
  }
  return dotp(A, B) / magnitude;
}
//end of copy.
/**
 * Builds a plain array of pseudo-random numbers.
 *
 * Each element is (Math.random() - 0.5) * range, so for a positive `range`
 * the values fall in [-range / 2, range / 2).
 *
 * @param {number} range - Width of the interval the values are spread over.
 * @param {number} length - Number of elements to generate.
 * @returns {number[]} The generated values.
 */
function createTensor(range, length) {
  const values = [];
  while (values.length < length) {
    const centred = Math.random() - 0.5;
    values.push(centred * range);
  }
  return values;
}
// Number of components in every generated vector (dataset and query alike).
const tensorLength = 784;
// Number of synthetic vectors to generate.
const datasetSize = 10000;
// Randomly generated stand-in for a precomputed index; built once at page load.
const dataset = [];
for (let i = 0; i < datasetSize; i++) {
  // Use the shared constant so the vector width is defined in one place.
  dataset.push(createTensor(1.0E8, tensorLength));
}
console.log("Generated dataset ",dataset, dataset.length);
/**
 * Simulates a query: generates a random query vector, compares it with every
 * vector in `dataset`, and logs how many exceed a cosine similarity of 0.2.
 *
 * Reads the outer `dataset` and `tensorLength`; returns nothing.
 */
function search() {
  // The query vector must have the same width as the dataset vectors, so use
  // the shared constant rather than repeating the literal.
  const searchTensor = createTensor(1.0E8, tensorLength);
  let nmatches = 0;
  for (const candidate of dataset) {
    if (cosineSimilarity(searchTensor, candidate) > 0.2) {
      nmatches++;
    }
  }
  console.log("Matches",nmatches);
}
// Run a simulated search each time the "Run Search" button is clicked.
document
  .querySelector("#searchbutton")
  .addEventListener("click", () => {
    search();
  });
</script>
</body>
</html>
<html>
<head>
</head>
<body>
<div>
This tests creating a sentence similarity index using TensorFlow and then searching it.
It takes about 10s to generate the index using content from www.hlx.live containing 77 pages, and 5s to query that data with
1 sentence "Tell me about everything", indicating that a page in that site knows about everything.
Go to <a href="https://www.hlx.live/docpages-index.json">https://www.hlx.live/docpages-index.json</a>.
Copy and paste the contents here and click "Load Sentences" to create the index.
</div>
<textarea name="sentences"> </textarea>
<button id="load" >Load Sentences</button>
<div> Then query</div>
<input type="text" name="query" value="Tell me about everything you know" />
<button id="searchbutton" >Run Search</button>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow/tfjs"></script>
<script src="https://cdn.jsdelivr.net/npm/@tensorflow-models/universal-sentence-encoder"></script>
<script type="text/javascript">
/**
 * Dot product of two numeric vectors.
 *
 * Iteration is driven by the length of `x`; `y` is expected to be at least
 * as long. A missing `y[i]` evaluates to `undefined`, which makes the
 * result NaN.
 *
 * @param {number[]} x - First vector; its length drives the iteration.
 * @param {number[]} y - Second vector.
 * @returns {number} Sum of x[i] * y[i] over the indices of x, or 0 when x is empty.
 */
function dotp(x, y) {
  // Accumulate directly instead of building a temporary array of products
  // with map(): this function is called for every vector comparison.
  let sum = 0;
  for (let i = 0; i < x.length; i++) {
    sum += x[i] * y[i];
  }
  return sum;
}
/**
 * Cosine similarity of two vectors: dot(A, B) / (|A| * |B|).
 *
 * @param {number[]} A - First vector.
 * @param {number[]} B - Second vector, same length as A.
 * @returns {number} The similarity, or 0 when either vector has zero
 *   magnitude (including empty vectors).
 */
function cosineSimilarity(A, B) {
  const magnitude = Math.sqrt(dotp(A, A)) * Math.sqrt(dotp(B, B));
  // A zero-magnitude vector has no direction; report "no similarity"
  // instead of letting 0 / 0 produce NaN.
  if (magnitude === 0) {
    return 0;
  }
  return dotp(A, B) / magnitude;
}
/**
 * Builds a plain array of pseudo-random numbers.
 *
 * Each element is (Math.random() - 0.5) * range, so for a positive `range`
 * the values fall in [-range / 2, range / 2).
 *
 * NOTE(review): nothing in the visible part of this script calls this
 * helper — confirm whether it is still needed.
 *
 * @param {number} range - Width of the interval the values are spread over.
 * @param {number} length - Number of elements to generate.
 * @returns {number[]} The generated values.
 */
function createTensor(range, length) {
  const values = [];
  while (values.length < length) {
    const centred = Math.random() - 0.5;
    values.push(centred * range);
  }
  return values;
}
// Embedding vectors searched by search(); starts empty and is replaced
// wholesale by loadSentences() once the embeddings have been computed.
var dataset = [];
// Promise for the loaded Universal Sentence Encoder model, shared by all
// calls so the model is loaded once rather than on every request.
let useModelPromise = null;

/**
 * Embeds a list of sentences and passes the result to `callback`.
 *
 * @param {string[]} list_sentences - Sentences to embed.
 * @param {function} callback - Invoked with the value produced by
 *   `model.embed(list_sentences)`.
 * @returns {Promise<void>} Settles after `callback` has run. Failures
 *   (model load, embedding, or an exception thrown by `callback`) are logged
 *   with console.error and are not rethrown.
 */
function get_embeddings(list_sentences, callback) {
  if (useModelPromise === null) {
    useModelPromise = use.load().catch((err) => {
      // Forget a failed load so that a later call can try again.
      useModelPromise = null;
      throw err;
    });
  }
  return useModelPromise
    .then((model) => model.embed(list_sentences))
    .then((embeddings) => {
      callback(embeddings);
    })
    .catch((err) => {
      console.error("get_embeddings failed", err);
    });
}
/**
 * Embeds `query`, compares it with every vector in `dataset`, and logs the
 * number of vectors whose cosine similarity exceeds 0.2 together with the
 * elapsed time in milliseconds.
 *
 * @param {string} query - Text to search for.
 */
function search(query) {
  console.log("Start Query");
  const start = Date.now();
  get_embeddings([query], (embeddings) => {
    // One sentence was embedded, so the first row is the query vector.
    const searchTensor = embeddings.arraySync()[0];
    // The values are now copied into a plain array; release the tensor's
    // backing memory, which TensorFlow.js does not reclaim automatically.
    embeddings.dispose();
    console.log(searchTensor);
    let nmatches = 0;
    for (const candidate of dataset) {
      if (cosineSimilarity(searchTensor, candidate) > 0.2) {
        nmatches++;
      }
    }
    console.log("Matches",nmatches);
    console.log("Took ",Date.now()-start,"ms");
  });
}
/**
 * Builds the search index from pasted JSON text.
 *
 * The text must parse to an object with a `data` array; the `content`
 * property of each entry is embedded and the resulting vectors replace the
 * outer `dataset`. Invalid input is reported with console.error and leaves
 * `dataset` untouched.
 *
 * @param {string} sentences - JSON text of the form {"data": [{"content": ...}, ...]}.
 * @returns {Promise<void>} Resolves once embedding has been requested; the
 *   index itself is filled in later, inside the get_embeddings callback.
 */
async function loadSentences(sentences) {
  console.log("Start Create Index");
  const start = Date.now();
  let parsed;
  try {
    parsed = JSON.parse(sentences);
  } catch (err) {
    // The text comes straight from a textarea, so malformed JSON is expected.
    console.error("Could not parse the pasted index as JSON", err);
    return;
  }
  if (parsed === null || typeof parsed !== "object" || !Array.isArray(parsed.data)) {
    console.error("Expected a JSON object with a \"data\" array");
    return;
  }
  console.log(parsed.data);
  const pageTexts = parsed.data.map((page) => page.content);
  get_embeddings(pageTexts, (embeddings) => {
    dataset = embeddings.arraySync();
    // The values are now copied into plain arrays; release the tensor's
    // backing memory, which TensorFlow.js does not reclaim automatically.
    embeddings.dispose();
    console.log("Dataset Now", dataset);
    console.log("Took ",Date.now()-start,"ms");
  });
}
// "Run Search": query the index with the current contents of the query box.
document
  .querySelector("#searchbutton")
  .addEventListener("click", () => {
    const queryInput = document.querySelector('[name="query"]');
    search(queryInput.value);
  });
// "Load Sentences": build the index from the JSON pasted into the textarea.
document
  .querySelector("#load")
  .addEventListener("click", async () => {
    const sentencesInput = document.querySelector('[name="sentences"]');
    await loadSentences(sentencesInput.value);
  });
</script>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment