Skip to content

Instantly share code, notes, and snippets.

@fabrizioc1
Created April 2, 2018 12:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fabrizioc1/c3466f8ddfb8846840e972c5ff919ab2 to your computer and use it in GitHub Desktop.
Save fabrizioc1/c3466f8ddfb8846840e972c5ff919ab2 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"lsh_results: 49\n",
"lsh_threshold: 0.8909\n",
"\n",
"seed: [0, 1, 0, 1, 1, 0, 0, 1, 1, 1]\n",
" using lsh:\n",
" match: (0.8571, [0, 1, 0, 1, 1, 1, 0, 1, 1, 1])\n",
" match: (0.6667, [1, 1, 0, 1, 1, 1, 1, 1, 1, 1])\n",
" match: (0.6250, [1, 1, 0, 0, 1, 0, 1, 1, 1, 1])\n",
" using search:\n",
" match: (0.8571, [0, 1, 0, 1, 1, 1, 0, 1, 1, 1])\n",
" match: (0.8333, [0, 1, 0, 1, 1, 0, 0, 1, 1, 0])\n",
" match: (0.8333, [0, 1, 0, 1, 1, 0, 0, 1, 1, 0])\n",
"seed: [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n",
" using lsh:\n",
" match: (0.8571, [1, 1, 1, 1, 1, 1, 0, 1, 0, 0])\n",
" using search:\n",
" match: (0.8571, [1, 1, 1, 1, 1, 1, 0, 1, 0, 0])\n",
"seed: [1, 0, 1, 0, 1, 1, 1, 0, 0, 1]\n",
" using lsh:\n",
" match: (0.8571, [1, 0, 1, 1, 1, 1, 1, 0, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 1, 1, 0, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 1, 1, 0, 0, 1])\n",
" match: (0.8333, [0, 0, 1, 0, 1, 1, 1, 0, 0, 1])\n",
" using search:\n",
" match: (0.8571, [1, 0, 1, 0, 1, 1, 1, 0, 1, 1])\n",
" match: (0.8571, [1, 0, 1, 0, 1, 1, 1, 0, 1, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 1, 1, 0, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 1, 1, 0, 0, 1])\n",
"seed: [1, 1, 0, 1, 1, 0, 1, 0, 1, 1]\n",
" using lsh:\n",
" match: (0.8571, [0, 1, 0, 1, 1, 0, 1, 0, 1, 1])\n",
" match: (0.7778, [1, 1, 1, 1, 1, 1, 1, 0, 1, 1])\n",
" match: (0.7143, [0, 1, 0, 0, 1, 0, 1, 0, 1, 1])\n",
" match: (0.7143, [1, 1, 0, 0, 1, 0, 0, 0, 1, 1])\n",
" using search:\n",
" match: (0.8571, [0, 1, 0, 1, 1, 0, 1, 0, 1, 1])\n",
" match: (0.7778, [1, 1, 1, 1, 1, 1, 1, 0, 1, 1])\n",
" match: (0.7778, [1, 1, 0, 1, 1, 1, 1, 1, 1, 1])\n",
" match: (0.7500, [1, 1, 0, 0, 1, 0, 1, 1, 1, 1])\n",
"seed: [1, 0, 0, 1, 1, 0, 1, 1, 0, 1]\n",
" using lsh:\n",
" match: (0.8571, [1, 0, 1, 1, 1, 0, 1, 1, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 0, 1, 1, 0, 1])\n",
" match: (0.8333, [0, 0, 0, 1, 1, 0, 1, 1, 0, 1])\n",
" using search:\n",
" match: (0.8571, [1, 0, 1, 1, 1, 0, 1, 1, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 0, 1, 1, 0, 1])\n",
" match: (0.8333, [1, 0, 0, 1, 0, 0, 1, 1, 0, 1])\n",
"seed: [0, 0, 1, 0, 0, 1, 0, 0, 1, 1]\n",
" using lsh:\n",
" match: (0.7500, [0, 0, 1, 0, 0, 0, 0, 0, 1, 1])\n",
" using search:\n",
" match: (0.8000, [0, 0, 1, 0, 0, 1, 1, 0, 1, 1])\n",
"seed: [0, 0, 1, 0, 1, 1, 1, 0, 1, 1]\n",
" using lsh:\n",
" match: (0.8571, [0, 0, 1, 1, 1, 1, 1, 0, 1, 1])\n",
" match: (0.8571, [1, 0, 1, 0, 1, 1, 1, 0, 1, 1])\n",
" match: (0.8571, [1, 0, 1, 0, 1, 1, 1, 0, 1, 1])\n",
" match: (0.8333, [0, 0, 1, 0, 1, 0, 1, 0, 1, 1])\n",
" match: (0.7143, [1, 0, 1, 0, 1, 0, 1, 0, 1, 1])\n",
" match: (0.7143, [1, 0, 1, 0, 1, 0, 1, 0, 1, 1])\n",
" match: (0.6667, [1, 0, 1, 1, 1, 1, 1, 1, 1, 1])\n",
" match: (0.6250, [1, 0, 1, 1, 1, 0, 1, 0, 1, 1])\n",
" using search:\n",
" match: (0.8571, [1, 0, 1, 0, 1, 1, 1, 0, 1, 1])\n",
" match: (0.8571, [0, 0, 1, 1, 1, 1, 1, 0, 1, 1])\n",
" match: (0.8571, [1, 0, 1, 0, 1, 1, 1, 0, 1, 1])\n",
" match: (0.8571, [0, 1, 1, 0, 1, 1, 1, 0, 1, 1])\n",
" match: (0.8333, [0, 0, 1, 0, 0, 1, 1, 0, 1, 1])\n",
" match: (0.8333, [0, 0, 1, 0, 0, 1, 1, 0, 1, 1])\n",
" match: (0.8333, [0, 0, 1, 0, 0, 1, 1, 0, 1, 1])\n",
" match: (0.8333, [0, 0, 1, 0, 1, 1, 1, 0, 0, 1])\n",
"seed: [1, 1, 0, 1, 1, 1, 0, 1, 0, 0]\n",
" using lsh:\n",
" match: (0.5714, [1, 1, 0, 0, 1, 0, 1, 1, 0, 0])\n",
" using search:\n",
" match: (0.8571, [1, 1, 1, 1, 1, 1, 0, 1, 0, 0])\n",
"seed: [0, 1, 1, 0, 0, 0, 1, 0, 0, 0]\n",
" using lsh:\n",
" match: (0.7500, [0, 1, 1, 0, 0, 1, 1, 0, 0, 0])\n",
" match: (0.7500, [0, 1, 1, 0, 0, 1, 1, 0, 0, 0])\n",
" using search:\n",
" match: (0.7500, [0, 1, 1, 0, 0, 1, 1, 0, 0, 0])\n",
" match: (0.7500, [0, 1, 1, 0, 0, 0, 1, 0, 1, 0])\n",
"seed: [1, 0, 1, 1, 1, 0, 1, 0, 0, 1]\n",
" using lsh:\n",
" match: (0.8571, [1, 0, 1, 1, 1, 1, 1, 0, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 0, 1, 1, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 1, 1, 0, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 0, 1, 1, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 1, 1, 0, 0, 1])\n",
" using search:\n",
" match: (0.8571, [1, 0, 1, 1, 1, 0, 1, 0, 1, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 0, 1, 1, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 0, 1, 1, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 1, 1, 0, 0, 1])\n",
" match: (0.8571, [1, 0, 1, 1, 1, 1, 1, 0, 0, 1])\n",
"seed: [1, 1, 1, 1, 1, 0, 0, 0, 1, 1]\n",
" using lsh:\n",
" match: (0.8571, [1, 1, 1, 0, 1, 0, 0, 0, 1, 1])\n",
" match: (0.7778, [1, 1, 1, 1, 1, 1, 1, 0, 1, 1])\n",
" match: (0.7778, [1, 1, 1, 1, 1, 1, 0, 1, 1, 1])\n",
" match: (0.7778, [1, 1, 1, 1, 1, 1, 0, 1, 1, 1])\n",
" match: (0.7500, [1, 1, 1, 0, 1, 0, 0, 1, 1, 1])\n",
" match: (0.7500, [1, 1, 1, 0, 1, 0, 0, 1, 1, 1])\n",
" match: (0.7500, [1, 1, 1, 0, 1, 1, 0, 0, 1, 1])\n",
" match: (0.7500, [1, 1, 1, 0, 1, 0, 1, 0, 1, 1])\n",
" match: (0.6667, [0, 1, 1, 1, 1, 0, 1, 1, 1, 1])\n",
" match: (0.6250, [0, 1, 1, 0, 1, 0, 0, 1, 1, 1])\n",
" match: (0.6000, [1, 1, 1, 0, 1, 1, 1, 1, 1, 1])\n",
" match: (0.5556, [0, 1, 1, 0, 1, 1, 1, 0, 1, 1])\n",
" using search:\n",
" match: (0.8571, [1, 0, 1, 1, 1, 0, 0, 0, 1, 1])\n",
" match: (0.8571, [1, 1, 1, 0, 1, 0, 0, 0, 1, 1])\n",
" match: (0.7778, [1, 1, 1, 1, 1, 1, 1, 0, 1, 1])\n",
" match: (0.7778, [1, 1, 1, 1, 1, 1, 0, 1, 1, 1])\n",
" match: (0.7778, [1, 1, 1, 1, 1, 1, 0, 1, 1, 1])\n",
" match: (0.7500, [1, 1, 1, 0, 1, 0, 0, 1, 1, 1])\n",
" match: (0.7500, [1, 0, 1, 1, 1, 0, 1, 0, 1, 1])\n",
" match: (0.7500, [1, 1, 1, 0, 1, 0, 1, 0, 1, 1])\n",
" match: (0.7500, [1, 1, 1, 0, 1, 0, 0, 1, 1, 1])\n",
" match: (0.7500, [1, 1, 1, 1, 0, 1, 0, 0, 1, 1])\n",
" match: (0.7500, [1, 1, 1, 0, 1, 1, 0, 0, 1, 1])\n",
" match: (0.7143, [1, 1, 1, 1, 0, 0, 0, 0, 0, 1])\n",
"seed: [0, 0, 0, 1, 0, 1, 0, 1, 0, 1]\n",
" using lsh:\n",
" match: (0.8000, [0, 1, 0, 1, 0, 1, 0, 1, 0, 1])\n",
" using search:\n",
" match: (0.8000, [0, 0, 0, 1, 0, 1, 1, 1, 0, 1])\n",
"seed: [0, 0, 0, 1, 0, 1, 1, 1, 1, 1]\n",
" using lsh:\n",
" match: (1.0000, [0, 0, 0, 1, 0, 1, 1, 1, 1, 1])\n",
" match: (0.8571, [0, 1, 0, 1, 0, 1, 1, 1, 1, 1])\n",
" match: (0.7143, [0, 1, 0, 1, 0, 0, 1, 1, 1, 1])\n",
" using search:\n",
" match: (1.0000, [0, 0, 0, 1, 0, 1, 1, 1, 1, 1])\n",
" match: (0.8571, [0, 1, 0, 1, 0, 1, 1, 1, 1, 1])\n",
" match: (0.8333, [0, 0, 0, 1, 0, 1, 1, 1, 0, 1])\n",
"seed: [1, 1, 0, 1, 1, 0, 0, 0, 0, 1]\n",
" using lsh:\n",
" match: (0.8000, [1, 1, 0, 0, 1, 0, 0, 0, 0, 1])\n",
" using search:\n",
" match: (0.8000, [1, 1, 0, 0, 1, 0, 0, 0, 0, 1])\n"
]
}
],
"source": [
"import json\n",
"import random\n",
"import numpy as np\n",
"import functools\n",
"from collections import defaultdict, OrderedDict\n",
"\n",
"LSH_BAND_COUNT = 4\n",
" \n",
"def minhash(v, a, b, p):\n",
" row_numbers = np.arange(len(v), dtype = np.int)\n",
" hash_values = (a * row_numbers + b) % p\n",
" minhash_values = [hash_value for hash_value, feature in zip(hash_values, v) if feature]\n",
" if len(minhash_values) > 0:\n",
" minhash_value = min(minhash_values)\n",
" else:\n",
" minhash_value = 0\n",
" return minhash_value\n",
"\n",
"def get_lsh(sig, b, r):\n",
" lsh = []\n",
" for i, band in enumerate(range(b)):\n",
" lsh_hash_input = tuple(sig[i * r:i * r + r])\n",
" lsh_hash_value = hash(lsh_hash_input)\n",
" lsh.append(lsh_hash_value)\n",
" return lsh\n",
"\n",
"def jaccard(x, y):\n",
" a = np.array(x)\n",
" b = np.array(y)\n",
" union = float(sum(a | b)) \n",
" intersection = float(sum(a & b))\n",
" return round(intersection / union, 4)\n",
"\n",
"def create_signature(features, hash_functions):\n",
" return [hash_function(features) for hash_function in hash_functions]\n",
"\n",
"def find_similar_features(band_count, hash_functions, source_features, target_features):\n",
" r_value = len(hash_functions) / band_count \n",
" \n",
" source_sigs = [create_signature(item, hash_functions) for item in source_features]\n",
" target_sigs = [create_signature(item, hash_functions) for item in target_features]\n",
"\n",
" source_lsh_values = [(i, get_lsh(sig, band_count, r_value)) for i, sig in enumerate(source_sigs)]\n",
" target_lsh_values = [(i, get_lsh(sig, band_count, r_value)) for i, sig in enumerate(target_sigs)]\n",
"\n",
" results = set() \n",
" for i, source_lsh_hashes in source_lsh_values:\n",
" for j, target_lsh_hashes in target_lsh_values:\n",
" common_lsh_hashes = set(source_lsh_hashes) & set(target_lsh_hashes)\n",
" if common_lsh_hashes:\n",
" results.add((i, j))\n",
" return results\n",
"\n",
"def read_json(json_path):\n",
" with open(json_path) as json_file: \n",
" json_data = json.load(json_file)\n",
" return json_data\n",
"\n",
"config = read_json('/tmp/hash_function_seeds.json')\n",
"seeds = config['seeds']\n",
"p_value = config['p_value']\n",
"r_value = len(hash_functions_seeds) / LSH_BAND_COUNT\n",
"lsh_threshold = (1.0 / BANDS_COUNT) ** (1.0 / r_value)\n",
"hash_functions = [functools.partial(minhash, a = s[0], b = s[1], p = p_value) for s in seeds]\n",
"\n",
"seed_features = np.array(read_json('/tmp/seeds.json'))\n",
"target_features = np.array(read_json('/tmp/features.json'))\n",
"similar_features = find_similar_features(LSH_BAND_COUNT, hash_functions, seed_features, target_features)\n",
"\n",
"lsh_results = defaultdict(list)\n",
"for i, j in similar_features:\n",
" lsh_results[i].append(j)\n",
"\n",
"print(\"lsh_results: %d\" % len(similar_features))\n",
"print(\"lsh_threshold: %.4f\\n\" % lsh_threshold)\n",
"\n",
"for seed_index, target_indices in lsh_results.items():\n",
" seed = seed_features[seed_index]\n",
" print(\"seed: %r\" % seed.tolist()) \n",
" lsh_matches = []\n",
" for i in target_indices:\n",
" target = target_features[i]\n",
" similarity = jaccard(seed, target)\n",
" lsh_match = (similarity, target.tolist())\n",
" lsh_matches.append(lsh_match)\n",
" \n",
" print(\" using lsh:\")\n",
" for (s, t) in sorted(lsh_matches, key=lambda (s, t): s, reverse=True):\n",
" print(\" match: (%.4f, %r)\" % (s, t))\n",
"\n",
" search_matches = []\n",
" for target in target_features:\n",
" similarity = jaccard(seed, target)\n",
" search_match = (similarity, target.tolist())\n",
" search_matches.append(search_match)\n",
" \n",
" print(\" using search:\")\n",
" search_matches = sorted(search_matches, key=lambda (s, t): s, reverse=True)\n",
" search_matches = search_matches[:len(lsh_matches)]\n",
" for (s, t) in search_matches:\n",
" print(\" match: (%.4f, %r)\" % (s, t))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment