Skip to content

Instantly share code, notes, and snippets.

@nknapp
Last active March 20, 2018 23:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nknapp/ca76edbaf2871c729409e593e616fa8b to your computer and use it in GitHub Desktop.
Save nknapp/ca76edbaf2871c729409e593e616fa8b to your computer and use it in GitHub Desktop.
searching phashs
// Precompute the number of matching bits by the xor-result of two bytes
const matchingBitsFromXor = []
for (let i = 0; i < 256; i++) {
matchingBitsFromXor[i] = countMatchingBits(i)
}
class PhashBuffer {
constructor (bytesPerHash, maxSize) {
this.buffer = Buffer.alloc(bytesPerHash * maxSize)
this.next = 0
this.bytesPerHash = bytesPerHash
this.maxSize = maxSize
}
add (phash) {
if (this.next >= this.maxSize) {
throw new Error(`Index out of bounds (max: ${this.maxSize}): ${this.next}`)
}
this.buffer.write(phash, this.next * this.bytesPerHash, this.bytesPerHash, 'hex')
this.next++
}
get (index) {
return this.buffer.toString('hex', index * 9, (index + 1) * 9)
}
/**
* Hamming distance between a value (Buffer) and a value stored at an index in the this
* PhashBuffer
* @param {number} compareIndex
* @param {Buffer} value
* @return {number}
*/
hammingDist (compareIndex, value) {
let matchingBits = 0
for (let i = 0; i < this.bytesPerHash; i++) {
let byte1 = value[i]
let byte2 = this.buffer[compareIndex * this.bytesPerHash + i]
let xor = byte1 ^ byte2
matchingBits += matchingBitsFromXor[xor]
// console.log(byte1.toString(2), byte2.toString(2), xor.toString(2), matchingBitsFromXor[xor], matchingBits)
}
return matchingBits
}
/**
* Search for similar values in the buffer
* @param {string} phash hex-encoded phash
* @param {number} threshold minimum number of matching bits for results
*/
search (phash, threshold, maxResults) {
if (phash.length !== this.bytesPerHash * 2) {
throw new Error(`phash ${phash} does not have correct length (actual: ${phash.length}, expected ${this.bytesPerHash * 2}`)
}
const value = Buffer.from(phash, 'hex')
const results = []
let count = 0
for (let index = 0; index < this.maxSize; index++) {
let dist = this.hammingDist(index, value)
// Keep the 20 best items
if (dist >= threshold) {
count++
// If we don't have enough results yet, if the current result is good enough, add it. Then sort
if (results.length < maxResults || results[maxResults - 1].dist < dist) {
results.push({index, dist})
results.sort((x1, x2) => x2.dist - x1.dist)
}
}
}
// Sort reverse by dist
return {
count, results: results.slice(0, maxResults)
}
}
}
/**
* Count the matching bits two bytes based on there biwise-xor value
* @param xor
* @return {number}
*/
function countMatchingBits (xor) {
// Assume all bytes match, xor is zero for all matching bits
let result = 8
for (let n = 0; n < 8; n++) {
// Reduce count if n-th bit of xor is not zero
result -= (xor >> n) & 1
}
return result
}
function performanceTest (nrOfHashes, threshold) {
console.error(`${nrOfHashes / 1000000} million hashes with threshold ${threshold}`)
const hashBuffer = new PhashBuffer(9, nrOfHashes)
const startTime = Date.now()
const nextValue = Buffer.alloc(9)
for (let i = 0; i < nrOfHashes; i++) {
nextValue.writeUInt32BE(i, 5)
hashBuffer.add(nextValue.toString('hex'))
}
console.error(`Buffer filled in ${Date.now() - startTime} ms`)
const searchStartTime = Date.now()
let phash = '000000000000000005'
console.error('Searching for ' + phash)
const {count, results} = hashBuffer.search(phash, threshold, 10)
let searchTime = Date.now() - searchStartTime
console.error(`Searched in ${searchTime} ms`)
console.error(`Got ${count} results`)
console.error(`Best results: `, results.map(({index, dist}) => ({index, dist, value: hashBuffer.get(index)})))
console.error(`---------------------------------------`)
console.log(`TLDR: ${nrOfHashes / 1000000} million and thres ${threshold} => ${searchTime} ms (${count} found)`)
}
console.error = () => {}
performanceTest(10000, 60)
performanceTest(50000, 60)
performanceTest(2000000, 60)
performanceTest(4000000, 60)
performanceTest(4000000, 70)
@nknapp
Copy link
Author

nknapp commented Mar 20, 2018

POC implementation of a buffer containing 72-bit phashes (perceptive hashes for image comparison).
I wanted to know if iterating a buffer like that is a feasible method for finding similar images in a database.
For me, the answer is yes.

Stats for 4 million hashes searched in 139 ms on my laptop

TLDR: 0.05 million and thres 60 => 2 ms (49773 found)
TLDR: 0.01 million and thres 60 => 8 ms (9999 found)
TLDR: 0.05 million and thres 60 => 1 ms (49773 found)
TLDR: 2 million and thres 60 => 72 ms (1653600 found)
TLDR: 4 million and thres 60 => 143 ms (3031018 found)
TLDR: 4 million and thres 70 => 139 ms (254 found)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment