|
use std::env; |
|
use std::process; |
|
use std::io::{self, BufRead, BufReader}; |
|
use std::collections::{BTreeMap, HashMap}; |
|
|
|
type Hash = u64; |
|
|
|
type Distance = u32; |
|
|
|
enum BKTree { |
|
Empty, |
|
NonEmpty(Node), |
|
} |
|
|
|
struct Node { |
|
hash: Hash, |
|
children: BTreeMap<Distance, Node>, |
|
} |
|
|
|
impl BKTree { |
|
fn new() -> BKTree { |
|
BKTree::Empty |
|
} |
|
|
|
fn insert(&mut self, new: Hash) { |
|
use BKTree::*; |
|
match *self { |
|
Empty => *self = NonEmpty(Node::single(new)), |
|
NonEmpty(ref mut node) => node.insert(new), |
|
} |
|
} |
|
|
|
fn find(&self, needle: Hash, tolerance: Distance) -> Vec<Hash> { |
|
use BKTree::*; |
|
match self { |
|
&Empty => Vec::new(), |
|
&NonEmpty(ref node) => node.find(needle, tolerance), |
|
} |
|
} |
|
} |
|
|
|
impl Node { |
|
fn single(hash: Hash) -> Node { |
|
Node { |
|
hash: hash, |
|
children: BTreeMap::new(), |
|
} |
|
} |
|
|
|
fn insert(&mut self, new: Hash) { |
|
if self.hash != new { |
|
let new_dist = distance(new, self.hash); |
|
self.children |
|
.entry(new_dist) |
|
.or_insert(Node::single(new)) |
|
.insert(new) |
|
} |
|
} |
|
|
|
fn find(&self, needle: Hash, tolerance: Distance) -> Vec<Hash> { |
|
let mut result = Vec::new(); |
|
let needle_dist = distance(self.hash, needle); |
|
if needle_dist <= tolerance { |
|
result.push(self.hash) |
|
} |
|
for (&dist, ref tree) in &self.children { |
|
if dist + tolerance >= needle_dist && needle_dist + tolerance >= dist { |
|
let found = tree.find(needle, tolerance); |
|
for hash in found { |
|
result.push(hash) |
|
} |
|
} |
|
} |
|
result |
|
} |
|
} |
|
|
|
#[test] |
|
fn test_insertion_and_finding() { |
|
let mut t = BKTree::new(); |
|
t.insert(0x2af7); |
|
t.insert(0x6af7); |
|
for &(needle, dist, ref expected) in &[ |
|
(0x6af7, 0, vec![0x6af7]), |
|
(0x6bf7, 1, vec![0x6af7]), |
|
(0x6bf7, 2, vec![0x2af7, 0x6af7]), |
|
(0x6bf6, 1, vec![]), |
|
(0x6bf6, 2, vec![0x6af7]), |
|
(0x6bf6, 3, vec![0x2af7, 0x6af7]), |
|
] { |
|
let found = t.find(needle, dist); |
|
assert_eq!(&found, expected); |
|
} |
|
} |
|
|
|
fn to_hash(s: &str) -> Hash { |
|
Hash::from_str_radix(s, 16).expect("parsing hash") |
|
} |
|
|
|
fn distance(a: Hash, b: Hash) -> Distance { |
|
(a ^ b).count_ones() |
|
} |
|
|
|
fn find_by_hashes<'a>(mapping: &'a HashMap<Hash, Vec<String>>, hashes: &[Hash]) -> Vec<&'a String> { |
|
hashes |
|
.iter() |
|
.map(|hash| mapping.get(hash)) |
|
.filter(|opt| opt.is_some()) |
|
.flat_map(|opt| opt.expect("must be some")) |
|
.collect() |
|
} |
|
|
|
fn load_from_file(name: &str) -> (BKTree, HashMap<Hash, Vec<String>>) { |
|
use std::fs::*; |
|
let file = File::open(name).expect("opening file"); |
|
let mut tree = BKTree::new(); |
|
let mut files = HashMap::new(); |
|
for line in BufReader::new(file).lines().map(|l| l.unwrap()) { |
|
let chunks: Vec<&str> = line.splitn(2, ' ').collect(); |
|
let hash = to_hash(&chunks[0]); |
|
let value = String::from(chunks[1]); |
|
tree.insert(hash); |
|
let mut entry = files.entry(hash).or_insert_with(|| Vec::new()); |
|
entry.push(value); |
|
} |
|
(tree, files) |
|
} |
|
|
|
fn search(hashes_file: &str) { |
|
use std::str::FromStr; |
|
let tolerance = std::env::var("TOLERANCE") |
|
.map(|s| u32::from_str(&s)) |
|
.unwrap_or(Ok(12)) |
|
.expect("parsing TOLERANCE"); |
|
let (tree, files) = load_from_file(hashes_file); |
|
let stdin = io::stdin(); |
|
for line in stdin.lock().lines().map(|l| l.unwrap()) { |
|
assert!(line.len() == 16); |
|
let similar = tree.find(to_hash(&line), tolerance); |
|
if similar.len() > 1 { |
|
print!("{} ", line); |
|
for val in find_by_hashes(&files, &similar) { |
|
print!("'{}' ", val); |
|
} |
|
println!(""); |
|
} |
|
} |
|
} |
|
|
|
fn usage_and_exit(exitcode: i32) { |
|
eprintln!("usage: bk phash_output < needle_hashes"); |
|
process::exit(exitcode); |
|
} |
|
|
|
fn main() { |
|
let args: Vec<_> = env::args().skip(1).collect(); |
|
if args.len() != 1 { |
|
usage_and_exit(1); |
|
} |
|
match args[0].as_str() { |
|
"-h" => usage_and_exit(0), |
|
input => search(&input), |
|
} |
|
} |