Skip to content

Instantly share code, notes, and snippets.

@jstepien
Last active December 1, 2019 10:15
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save jstepien/d24a06b09b8bf067fe8bec333a6c3599 to your computer and use it in GitHub Desktop.
Save jstepien/d24a06b09b8bf067fe8bec333a6c3599 to your computer and use it in GitHub Desktop.
It All Looks the Same to Me

In which Rust, perceptual hashing, and BK-trees come together.

cargo test
cargo build --release
< needle_hashes.txt ./target/release/bk phash_output.txt
< needle_hashes.txt TOLERANCE=10 ./target/release/bk phash_output.txt
[package]
name = "bk"
version = "0.1.0"
authors = ["Jan Stępień"]
use std::env;
use std::process;
use std::io::{self, BufRead, BufReader};
use std::collections::{BTreeMap, HashMap};
type Hash = u64;
type Distance = u32;
enum BKTree {
Empty,
NonEmpty(Node),
}
struct Node {
hash: Hash,
children: BTreeMap<Distance, Node>,
}
impl BKTree {
fn new() -> BKTree {
BKTree::Empty
}
fn insert(&mut self, new: Hash) {
use BKTree::*;
match *self {
Empty => *self = NonEmpty(Node::single(new)),
NonEmpty(ref mut node) => node.insert(new),
}
}
fn find(&self, needle: Hash, tolerance: Distance) -> Vec<Hash> {
use BKTree::*;
match self {
&Empty => Vec::new(),
&NonEmpty(ref node) => node.find(needle, tolerance),
}
}
}
impl Node {
fn single(hash: Hash) -> Node {
Node {
hash: hash,
children: BTreeMap::new(),
}
}
fn insert(&mut self, new: Hash) {
if self.hash != new {
let new_dist = distance(new, self.hash);
self.children
.entry(new_dist)
.or_insert(Node::single(new))
.insert(new)
}
}
fn find(&self, needle: Hash, tolerance: Distance) -> Vec<Hash> {
let mut result = Vec::new();
let needle_dist = distance(self.hash, needle);
if needle_dist <= tolerance {
result.push(self.hash)
}
for (&dist, ref tree) in &self.children {
if dist + tolerance >= needle_dist && needle_dist + tolerance >= dist {
let found = tree.find(needle, tolerance);
for hash in found {
result.push(hash)
}
}
}
result
}
}
#[test]
fn test_insertion_and_finding() {
let mut t = BKTree::new();
t.insert(0x2af7);
t.insert(0x6af7);
for &(needle, dist, ref expected) in &[
(0x6af7, 0, vec![0x6af7]),
(0x6bf7, 1, vec![0x6af7]),
(0x6bf7, 2, vec![0x2af7, 0x6af7]),
(0x6bf6, 1, vec![]),
(0x6bf6, 2, vec![0x6af7]),
(0x6bf6, 3, vec![0x2af7, 0x6af7]),
] {
let found = t.find(needle, dist);
assert_eq!(&found, expected);
}
}
fn to_hash(s: &str) -> Hash {
Hash::from_str_radix(s, 16).expect("parsing hash")
}
fn distance(a: Hash, b: Hash) -> Distance {
(a ^ b).count_ones()
}
fn find_by_hashes<'a>(mapping: &'a HashMap<Hash, Vec<String>>, hashes: &[Hash]) -> Vec<&'a String> {
hashes
.iter()
.map(|hash| mapping.get(hash))
.filter(|opt| opt.is_some())
.flat_map(|opt| opt.expect("must be some"))
.collect()
}
fn load_from_file(name: &str) -> (BKTree, HashMap<Hash, Vec<String>>) {
use std::fs::*;
let file = File::open(name).expect("opening file");
let mut tree = BKTree::new();
let mut files = HashMap::new();
for line in BufReader::new(file).lines().map(|l| l.unwrap()) {
let chunks: Vec<&str> = line.splitn(2, ' ').collect();
let hash = to_hash(&chunks[0]);
let value = String::from(chunks[1]);
tree.insert(hash);
let mut entry = files.entry(hash).or_insert_with(|| Vec::new());
entry.push(value);
}
(tree, files)
}
fn search(hashes_file: &str) {
use std::str::FromStr;
let tolerance = std::env::var("TOLERANCE")
.map(|s| u32::from_str(&s))
.unwrap_or(Ok(12))
.expect("parsing TOLERANCE");
let (tree, files) = load_from_file(hashes_file);
let stdin = io::stdin();
for line in stdin.lock().lines().map(|l| l.unwrap()) {
assert!(line.len() == 16);
let similar = tree.find(to_hash(&line), tolerance);
if similar.len() > 1 {
print!("{} ", line);
for val in find_by_hashes(&files, &similar) {
print!("'{}' ", val);
}
println!("");
}
}
}
fn usage_and_exit(exitcode: i32) {
eprintln!("usage: bk phash_output < needle_hashes");
process::exit(exitcode);
}
fn main() {
let args: Vec<_> = env::args().skip(1).collect();
if args.len() != 1 {
usage_and_exit(1);
}
match args[0].as_str() {
"-h" => usage_and_exit(0),
input => search(&input),
}
}
01f0fc9eff3b01c0
15066e39914e17f7
11e9dcfe3e3f0180 a.jpg
01f0fcdebf3b01c0 b.jpg
15066e39914e17f7 c.jpg
35066a79990aa7e7 d.jpg
90b5ce1b30b65bc9 e.jpg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment