Skip to content

Instantly share code, notes, and snippets.

@faustomorales
Last active November 3, 2020 09:19
Show Gist options
  • Save faustomorales/65f5d523aa6212766ff2fd2a4ade563d to your computer and use it in GitHub Desktop.
Save faustomorales/65f5d523aa6212766ff2fd2a4ade563d to your computer and use it in GitHub Desktop.
Compare PDQ hash performance using pure Python versus bindings to the C++ implementation. Results show 262 ms for pure Python and 7 ms for the bindings.
# Before running this script, you must clone the ThreatExchange
# repo (containing the pure Python implementation)
# and install the pdqhash package (containing the C bindings).
# You can do this by executing the following at the command line.
# git clone https://github.com/facebook/ThreatExchange.git
# pip install pdqhash
import urllib.request
import timeit
import sys
import os
# Make sure Python can find the Python implementation
sys.path.insert(0, os.path.abspath('ThreatExchange/hashing/pdq/python'))
# Import the bindings package as well as the
# pure Python implementation module
import pdqhashing.hasher.pdq_hasher as pdqpython
import pdqhash as pdqcython
import numpy as np
import cv2
# Fetch the sample image (thumbnail requested with w=256) from Wikimedia
# Commons and store it as example.jpg relative to the working directory.
urllib.request.urlretrieve(
    url="https://commons.wikimedia.org/w/thumb.php?f=Actinoscyphia_aurelia_1.jpg&w=256",
    filename="example.jpg",
)
# We want an apples-to-apples comparison, so
# we include I/O as part of the test for both
# approaches.
# The pure Python hasher is created once here, outside the timed
# functions, so its construction is not part of the measured time.
pyhasher = pdqpython.PDQHasher()
def pdq_using_cython_bindings(filepath):
    """Hash the image at *filepath* using the ``pdqhash`` bindings.

    The image is read from disk with OpenCV on every call, so file I/O is
    included in whatever time the caller measures.

    Args:
        filepath: Path to the image file to hash.

    Returns:
        The result of ``pdqcython.compute``; this script unpacks it as
        ``(hash_vector, quality)``.

    Raises:
        OSError: If OpenCV could not read or decode the file.
    """
    image = cv2.imread(filepath)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with a clear message rather than inside cvtColor.
        raise OSError(f'Could not read image file: {filepath!r}')
    # OpenCV loads pixels in BGR order; convert to RGB before hashing.
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return pdqcython.compute(image)
def pdq_using_pure_python(filepath):
    """Hash the image at *filepath* using the pure Python PDQ hasher.

    Delegates to the module-level ``pyhasher``; the file is handed over by
    path, so reading it is part of the call.

    Args:
        filepath: Path to the image file to hash.

    Returns:
        The object produced by ``pyhasher.fromFile``; this script reads it
        through ``getHash()`` and ``getQuality()``.
    """
    hash_and_quality = pyhasher.fromFile(filepath)
    return hash_and_quality
# Number of timed calls per implementation; each total is divided by this
# value to obtain the average time of a single call (in seconds).
n_runs = 100

print('Running Python benchmark.')
python_timer = timeit.Timer(
    stmt="pdq_using_pure_python('example.jpg')",
    globals=globals(),
)
time_python = python_timer.timeit(number=n_runs) / n_runs

print('Running Cython bindings benchmark.')
cython_timer = timeit.Timer(
    stmt="pdq_using_cython_bindings('example.jpg')",
    globals=globals(),
)
time_cython = cython_timer.timeit(number=n_runs) / n_runs
# Sanity check: the bindings and the pure Python implementation must
# produce identical output for the example image.
# The pure Python bits are reshaped to 16x16 and the row order is reversed
# to put the bits in the same order as that shown using:
# ./pdq-photo-hasher example.jpg | \
# cut -d',' -f1 | \
# ./hashtool256 bits | tr ' ' ','
hqpython = pdq_using_pure_python('example.jpg')
vector_cython, quality_cython = pdq_using_cython_bindings('example.jpg')

# Hash bits from the bindings, as integers.
bits_cython = vector_cython.astype('int')

# Hash bits from the pure Python implementation: parse the space-separated
# bit dump, then reorder the rows to match the bindings' layout.
python_bit_strings = hqpython.getHash().dumpBitsAcross().split(' ')
python_bits = np.array(python_bit_strings).astype('int')
bits_python = python_bits.reshape(16, 16)[::-1].flatten()

np.testing.assert_equal(bits_cython, bits_python)
assert quality_cython == hqpython.getQuality()

# Report the average per-call time in milliseconds.
print(f'Running with pure Python implementation took {round(1000*time_python)}ms, on average.')
print(f'Running with Cython bindings took {round(1000*time_cython)}ms, on average.')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment