Skip to content

Instantly share code, notes, and snippets.

@BusinessDuck
Last active December 4, 2023 18:10
Show Gist options
  • Save BusinessDuck/6d3712d1dcf9658fff12cbe2fa150ce0 to your computer and use it in GitHub Desktop.
Save BusinessDuck/6d3712d1dcf9658fff12cbe2fa150ce0 to your computer and use it in GitHub Desktop.
/* eslint-disable no-console */
const crypto = require('crypto');
/**
* Modulus operation with a twist. It takes two parameters, a and b, and returns the remainder
* of the division of a by b. However, there's a condition to handle the case when a is negative.
*/
function mod(a, b) {
return ((a % b) + b) % b;
}
/**
* Winnowing algorythm class implementation for document fingerprinting
*/
class Winnowing {
constructor(k = 40, salt = "", windowSize = 4, removeDuplicates = true) {
this.k = k;
this.salt = salt;
this.windowSize = windowSize;
this.removeDuplicates = removeDuplicates;
}
/**
* Given a document, computes all k-gram hashes and uses the
* winnowing algorithm to reduce their number. Optionally takes a
* list of boilerplate hashes to remove from the winnowed list.
* Returns the selected hashes and their indexes in the original list
*/
getHash(text) {
const [hashes, idx] = this._winnow(this._hashed_kgrams(text), this.windowSize);
return [hashes, idx];
}
/**
* Main method for this class to create all data from text document
*/
getFingerprint(text) {
const [hashes, idx] = this.getHash(text);
const token_len = text.length;
const token_coverage = this._getTokenCoverage(idx, this.k, token_len);
return [hashes, idx, token_coverage];
}
/**
* Determines the number of tokens in the original document which are included in the winnowed indices
*/
_getTokenCoverage(idx, k, token_len) {
const coverage = Array(token_len).fill(0);
for (let i = 0; i < idx.length; i++) {
for (let offset = 0; offset < k; offset++) {
if (idx[i] + offset < token_len) {
coverage[idx[i] + offset] = 1;
}
}
}
return coverage.reduce((acc, val) => acc + val, 0);
}
/**
* JavaScript implementation of the winnowing algorithm.
* Input and output are identical to the https://github.com/blingenf/copydetect
*/
_winnowing(hashes, windowSize) {
const selectedIdx = [];
const buffer = Array(windowSize).fill(Infinity);
let r = 0;
let minIdx = 0;
for (let hashIdx = 0; hashIdx < hashes.length; hashIdx++) {
const hashVal = hashes[hashIdx];
r = (r + 1) % windowSize;
buffer[r] = hashVal;
if (minIdx === r) {
let i = (r - 1 + windowSize) % windowSize;
while (i !== r) {
if (buffer[i] < buffer[minIdx]) {
minIdx = i;
}
i = (i - 1 + windowSize) % windowSize;
}
selectedIdx.push(hashIdx - (mod((r - minIdx), this.windowSize)));
} else if (buffer[r] < buffer[minIdx]) {
minIdx = r;
selectedIdx.push(hashIdx);
}
}
return selectedIdx;
}
/**
* implementation of the robust winnowing algorithm decribed
* in https://theory.stanford.edu/~aiken/publications/papers/sigmod03.pdf
* Returns a list of selected hashes and the indexes of those hashes.
*/
_winnow(hashes, window_size) {
if (window_size < 1) {
throw new Error("window_size must be greater than 0");
}
let selected_hashes;
let selected_idx;
if (window_size === 1) {
selected_hashes = hashes;
selected_idx = Array.from({ length: hashes.length }, (_, i) => i);
} else {
selected_idx = this._winnowing(hashes, window_size);
selected_hashes = selected_idx.map(idx => hashes[idx]);
}
if (this.removeDuplicates) {
const uniqueMap = new Map();
selected_hashes.forEach((hash, idx) => {
if (!uniqueMap.has(hash)) {
uniqueMap.set(hash, selected_idx[idx]);
}
});
selected_hashes = Array.from(uniqueMap.keys());
selected_idx = Array.from(uniqueMap.values());
}
return [selected_hashes, selected_idx];
}
_hash(x) {
const sha256_hash = crypto.createHash('sha256');
sha256_hash.update(String(x));
const hash_bytes = sha256_hash.digest();
// Convert hash bytes to BigInt
const bigIntHash = BigInt('0x' + hash_bytes.toString('hex'));
// Truncate
const truncated_hash = bigIntHash % BigInt(2 ** 64) - BigInt(2 ** 63);
return truncated_hash; // Keep as BigInt
}
/**
* Return hashes of all k-grams in a strin
*/
_hashed_kgrams(string) {
const hashes = [];
const length = string.length;
if (length < this.k) {
return [this._hash(string) + this.salt];
}
for (let offset = 0; offset <= length - this.k; offset++) {
hashes.push(this._hash(string.substring(offset, offset + this.k) + this.salt));
}
return hashes;
}
}
console.log("Starting the winnowing process...");
// Instantiate the Winnowing class
const winnowing = new Winnowing();
const text = `V="S"print("S","S","S")forVinrange(V,V+1):ifV>1:forVinrange(2,V)`;
// Get hashes and indices for the provided text
console.log("Computing hashes and indices...");
const [hashes, idx, token_coverage] = winnowing.getFingerprint(text);
console.log("Hashes and indices computed.");
console.log(hashes, idx, token_coverage);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment