Skip to content

Instantly share code, notes, and snippets.

@inhumantsar
Created April 22, 2024 04:28
Show Gist options
  • Save inhumantsar/7b58d3e475cc8af1f3fba75bb37a85f8 to your computer and use it in GitHub Desktop.
Save inhumantsar/7b58d3e475cc8af1f3fba75bb37a85f8 to your computer and use it in GitHub Desktop.
Cosine Similarity implemented in Typescript
//
// slammed this together after reaading an interesting from-scratch series of posts on TF-IDF + Cosine Similarity.
//
// it will be slow, if it even works. totally untested and full of misguided practices.
// might come back to this some day and make it at least functional.
//
// https://blog.christianperone.com/2011/09/machine-learning-text-feature-extraction-tf-idf-part-i/
//
class Vector extends Map<string, number> {
order: Set<string> | null = null;
static withOrder(v: Vocabulary) {
const vector = new Vector();
vector.order = v;
return vector;
}
orderedEntries() {
return Array.from(this.entries()).sort((a, b) => a[0] === b[0]
? 0
: a[0] < b[0]
? -1
: 1);
}
orderedValues() {
return this.orderedEntries().map((entry) => entry[1]);
}
l2norm() {
const denom = Math.sqrt((this.orderedValues().map((val) => val ^ 2).reduce((prev, cur) => prev + cur)));
this.forEach((val, key) => this.set(key, val / denom));
}
}
type Vocabulary = Set<string>;
class InverseDocumentFrequency {
private vocab: Vocabulary;
private _weights: Vector;
private documentFrequency: Vector;
private docs: CosineSimilarityDocument<any>[];
get weights() { return this._weights.orderedValues(); }
constructor(docs: CosineSimilarityDocument<any>[], vocab: Vocabulary) {
this.docs = docs;
this.vocab = vocab;
this._weights = Vector.withOrder(this.vocab);
this.documentFrequency = Vector.withOrder(this.vocab);
}
updateFrequency(term: string) {
this.documentFrequency.set(term, (this.documentFrequency.get(term) || 0) + 1);
}
updateWeights() {
for (const [term, count] of this.documentFrequency) {
this._weights.set(term, Math.log(this.docs.length / (1 + count)));
}
}
}
class CosineSimilarityDocument<T> {
private doc: T;
private tf: Vector;
private _tfidf: Vector;
private idf: InverseDocumentFrequency;
private vocab: Vocabulary;
private toStringArray: (doc: T) => string[];
get vector() {
if (this.idf.weights.length === 0) this.idf.updateWeights();
if (this._tfidf.size === 0) {
this.tf.orderedEntries().forEach(([term, value], idx) =>
this._tfidf.set(term, value * this.idf.weights[idx]));
this._tfidf.l2norm();
}
return this._tfidf;
}
constructor(doc: T, vocab: Vocabulary, idf: InverseDocumentFrequency, toStringArray: (doc: T) => string[]) {
this.doc = doc;
this.toStringArray = toStringArray;
this.idf = idf;
this.vocab = vocab;
this._tfidf = Vector.withOrder(this.vocab);
this.tf = this.buildTf();
}
private buildTf() {
const terms = this.toStringArray(this.doc);
const v = Vector.withOrder(this.vocab);
for (const term of terms) {
const prev = v.get(term) || 0;
v.set(term, prev + 1);
this.vocab.add(term);
if (prev === 0) this.idf.updateFrequency(term);
}
return v;
}
}
interface CosineSimilaritySearchResult<T> {
readonly doc: CosineSimilarityDocument<T>;
readonly result: number;
}
interface CosineSimilaritySearchResults<T> {
readonly tf: Vector;
readonly weights: number[];
readonly results: CosineSimilaritySearchResult<T>[];
}
class CosineSimilarityController<T> {
readonly vocab: Vocabulary;
readonly idf: InverseDocumentFrequency;
readonly docs: CosineSimilarityDocument<T>[];
readonly toStringArray: (doc: T) => string[];
private _searchCache: Map<T, CosineSimilaritySearchResults<T>>
constructor(docs: T[], toStringArray: (doc: T) => string[]) {
this.toStringArray = toStringArray;
this.vocab = new Set();
this.docs = new Array(docs.length);
this.idf = new InverseDocumentFrequency(this.docs, this.vocab);
this._searchCache = new Map();
docs.forEach((doc, idx) => this.docs[idx] = new CosineSimilarityDocument<T>(doc, this.vocab, this.idf, toStringArray));
}
private getSearchTF(query: T): Vector {
const tf = Vector.withOrder(this.vocab);
const terms = this.toStringArray(query);
for (const term of terms) {
const prev = tf.get(term) || 0;
tf.set(term, prev + 1);
}
return tf;
}
private getCosine(a: Vector, b: Vector) {
let dotProduct = 0;
const aVals = a.orderedValues();
const bVals = b.orderedValues();
for (let idx = 0; idx < a.size; idx++) {
dotProduct += aVals[idx] * bVals[idx];
}
return dotProduct / (
Math.sqrt(aVals.reduce((prev, curr) => prev + curr ^ 2)) *
Math.sqrt(bVals.reduce((prev, curr) => prev + curr ^ 2))
);
}
search(query: T): CosineSimilaritySearchResults<T> {
const tf = this.getSearchTF(query);
const tfidf = Vector.withOrder(this.vocab);
tf.orderedEntries().forEach(([term, value], idx) =>
tfidf.set(term, value * this.idf.weights[idx]));
tfidf.l2norm();
const results = this.docs.map((doc): CosineSimilaritySearchResult<T> => Object({
doc: doc,
result: this.getCosine(tfidf, doc.vector)
})).sort((a, b) => a.result === b.result ? 0
: a.result < b.result ? -1 : 1);
const result = { tf, weights: Array.from(this.idf.weights), results };
this._searchCache.set(query, result);
return result;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment