Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
It All Looks the Same to Me

In which Rust, perceptual hashing, and BK-trees come together.

cargo test
cargo build --release
< needle_hashes.txt ./target/release/bk phash_output.txt
< needle_hashes.txt TOLERANCE=10 ./target/release/bk phash_output.txt
[package]
name = "bk"
version = "0.1.0"
authors = ["Jan Stępień"]
use std::env;
use std::process;
use std::io::{self, BufRead, BufReader};
use std::collections::{BTreeMap, HashMap};
type Hash = u64;
type Distance = u32;
enum BKTree {
Empty,
NonEmpty(Node),
}
struct Node {
hash: Hash,
children: BTreeMap<Distance, Node>,
}
impl BKTree {
fn new() -> BKTree {
BKTree::Empty
}
fn insert(&mut self, new: Hash) {
use BKTree::*;
match *self {
Empty => *self = NonEmpty(Node::single(new)),
NonEmpty(ref mut node) => node.insert(new),
}
}
fn find(&self, needle: Hash, tolerance: Distance) -> Vec<Hash> {
use BKTree::*;
match self {
&Empty => Vec::new(),
&NonEmpty(ref node) => node.find(needle, tolerance),
}
}
}
impl Node {
fn single(hash: Hash) -> Node {
Node {
hash: hash,
children: BTreeMap::new(),
}
}
fn insert(&mut self, new: Hash) {
if self.hash != new {
let new_dist = distance(new, self.hash);
self.children
.entry(new_dist)
.or_insert(Node::single(new))
.insert(new)
}
}
fn find(&self, needle: Hash, tolerance: Distance) -> Vec<Hash> {
let mut result = Vec::new();
let needle_dist = distance(self.hash, needle);
if needle_dist <= tolerance {
result.push(self.hash)
}
for (&dist, ref tree) in &self.children {
if dist + tolerance >= needle_dist && needle_dist + tolerance >= dist {
let found = tree.find(needle, tolerance);
for hash in found {
result.push(hash)
}
}
}
result
}
}
#[test]
fn test_insertion_and_finding() {
let mut t = BKTree::new();
t.insert(0x2af7);
t.insert(0x6af7);
for &(needle, dist, ref expected) in &[
(0x6af7, 0, vec![0x6af7]),
(0x6bf7, 1, vec![0x6af7]),
(0x6bf7, 2, vec![0x2af7, 0x6af7]),
(0x6bf6, 1, vec![]),
(0x6bf6, 2, vec![0x6af7]),
(0x6bf6, 3, vec![0x2af7, 0x6af7]),
] {
let found = t.find(needle, dist);
assert_eq!(&found, expected);
}
}
fn to_hash(s: &str) -> Hash {
Hash::from_str_radix(s, 16).expect("parsing hash")
}
fn distance(a: Hash, b: Hash) -> Distance {
(a ^ b).count_ones()
}
fn find_by_hashes<'a>(mapping: &'a HashMap<Hash, Vec<String>>, hashes: &[Hash]) -> Vec<&'a String> {
hashes
.iter()
.map(|hash| mapping.get(hash))
.filter(|opt| opt.is_some())
.flat_map(|opt| opt.expect("must be some"))
.collect()
}
fn load_from_file(name: &str) -> (BKTree, HashMap<Hash, Vec<String>>) {
use std::fs::*;
let file = File::open(name).expect("opening file");
let mut tree = BKTree::new();
let mut files = HashMap::new();
for line in BufReader::new(file).lines().map(|l| l.unwrap()) {
let chunks: Vec<&str> = line.splitn(2, ' ').collect();
let hash = to_hash(&chunks[0]);
let value = String::from(chunks[1]);
tree.insert(hash);
let mut entry = files.entry(hash).or_insert_with(|| Vec::new());
entry.push(value);
}
(tree, files)
}
fn search(hashes_file: &str) {
use std::str::FromStr;
let tolerance = std::env::var("TOLERANCE")
.map(|s| u32::from_str(&s))
.unwrap_or(Ok(12))
.expect("parsing TOLERANCE");
let (tree, files) = load_from_file(hashes_file);
let stdin = io::stdin();
for line in stdin.lock().lines().map(|l| l.unwrap()) {
assert!(line.len() == 16);
let similar = tree.find(to_hash(&line), tolerance);
if similar.len() > 1 {
print!("{} ", line);
for val in find_by_hashes(&files, &similar) {
print!("'{}' ", val);
}
println!("");
}
}
}
fn usage_and_exit(exitcode: i32) {
eprintln!("usage: bk phash_output < needle_hashes");
process::exit(exitcode);
}
fn main() {
let args: Vec<_> = env::args().skip(1).collect();
if args.len() != 1 {
usage_and_exit(1);
}
match args[0].as_str() {
"-h" => usage_and_exit(0),
input => search(&input),
}
}
01f0fc9eff3b01c0
15066e39914e17f7
11e9dcfe3e3f0180 a.jpg
01f0fcdebf3b01c0 b.jpg
15066e39914e17f7 c.jpg
35066a79990aa7e7 d.jpg
90b5ce1b30b65bc9 e.jpg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.