Skip to content

Instantly share code, notes, and snippets.

@fabrizioc1
Created March 29, 2018 06:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fabrizioc1/e14698bbaf58c29845bb623e545e4d35 to your computer and use it in GitHub Desktop.
Save fabrizioc1/e14698bbaf58c29845bb623e545e4d35 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"def _hash_a(x):\n",
"    \"\"\"Hash a row index with (x + 1) mod 5.\"\"\"\n",
"    return (x + 1) % 5\n",
"\n",
"\n",
"def _hash_b(x):\n",
"    \"\"\"Hash a row index with (3x + 1) mod 5.\"\"\"\n",
"    return (3*x + 1) % 5\n",
"\n",
"\n",
"# Each function maps an index of a feature vector to a hash value in 0..4;\n",
"# a signature holds one entry per function listed here.\n",
"hash_functions = [_hash_a, _hash_b]\n",
"\n",
"# One binary feature vector per row; a nonzero entry marks an index as set.\n",
"features_list = np.array([\n",
"    [1, 0, 0, 1, 0],\n",
"    [0, 0, 1, 0, 0],\n",
"    [0, 1, 0, 1, 1],\n",
"    [1, 0, 1, 1, 0],\n",
"])\n",
"\n",
"def create_signature(features):\n",
"    \"\"\"Return the MinHash signature of one binary feature vector.\n",
"\n",
"    For every function in the global ``hash_functions`` the indices of the\n",
"    truthy entries of ``features`` are hashed and the smallest hash value\n",
"    is kept, giving one signature entry per hash function.\n",
"\n",
"    Args:\n",
"        features: sequence whose truthy entries mark the indices that are set.\n",
"\n",
"    Returns:\n",
"        A list with one minimum hash value per hash function.\n",
"\n",
"    Raises:\n",
"        ValueError: if ``features`` has no truthy entry, because the\n",
"            minimum over an empty collection of hash values is undefined.\n",
"    \"\"\"\n",
"    # The set indices do not depend on the hash function, so they are\n",
"    # collected once and reused for every hash function.\n",
"    active_rows = [i for i, feature in enumerate(features) if feature]\n",
"    if not active_rows:\n",
"        raise ValueError(\"features has no set entries; signature is undefined\")\n",
"    return [min(hash_function(i) for i in active_rows)\n",
"            for hash_function in hash_functions]\n",
"\n",
"\n",
"signatures = [create_signature(features) for features in features_list]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[[1, 0], [3, 2], [0, 0], [1, 0]]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"signatures"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"import functools\n",
"\n",
"def minhash(v, a, b, p):\n",
"    \"\"\"Return the smallest value of (a * i + b) % p over the set indices of v.\n",
"\n",
"    Args:\n",
"        v: binary feature vector; truthy entries mark the indices that are set.\n",
"        a: multiplier of the linear hash.\n",
"        b: offset of the linear hash.\n",
"        p: modulus of the linear hash.\n",
"\n",
"    Returns:\n",
"        The minimum hash value as a plain ``int``.\n",
"\n",
"    Raises:\n",
"        ValueError: if ``v`` has no truthy entry, because the minimum over\n",
"            an empty collection of hash values is undefined.\n",
"    \"\"\"\n",
"    # The builtin int replaces the np.int alias, which NumPy deprecated in\n",
"    # 1.20 and removed in 1.24.\n",
"    row_numbers = np.arange(len(v), dtype=int)\n",
"    hash_values = (a * row_numbers + b) % p\n",
"    active = np.asarray(v, dtype=bool)\n",
"    if not active.any():\n",
"        raise ValueError(\"v has no set entries; minhash is undefined\")\n",
"    # int() keeps the signature entries plain Python integers regardless of\n",
"    # how the installed NumPy version prints its scalar types.\n",
"    return int(hash_values[active].min())\n",
" \n",
"# (a, b) coefficient pairs, one pair per hash function (a * i + b) % 5.\n",
"seeds = np.array([[1, 1], [3, 1]])\n",
"# NOTE: this rebinds the global name hash_functions, which create_signature\n",
"# also reads at call time.\n",
"hash_functions = [\n",
"    functools.partial(minhash, a=coeff_a, b=coeff_b, p=5)\n",
"    for coeff_a, coeff_b in seeds\n",
"]\n",
"\n",
"\n",
"def create_signature_2(features):\n",
"    \"\"\"Return one minhash value per entry of the global hash_functions.\"\"\"\n",
"    signature = []\n",
"    for hash_function in hash_functions:\n",
"        signature.append(hash_function(features))\n",
"    return signature\n",
"\n",
"\n",
"signatures_2 = list(map(create_signature_2, features_list))"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[[1, 0], [3, 2], [0, 0], [1, 0]]"
]
},
"execution_count": 84,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"signatures_2"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"def jaccard(a, b):\n",
"    \"\"\"Return the Jaccard similarity of two binary vectors.\n",
"\n",
"    The similarity is the number of positions set in both vectors divided\n",
"    by the number of positions set in at least one of them.\n",
"\n",
"    Args:\n",
"        a: binary vector supporting elementwise ``|`` and ``&``.\n",
"        b: binary vector of the same length as ``a``.\n",
"\n",
"    Returns:\n",
"        A float in [0.0, 1.0]; 1.0 when neither vector has a set position.\n",
"    \"\"\"\n",
"    union = float(sum(a | b))\n",
"    if union == 0:\n",
"        # Two empty sets are treated as identical instead of dividing by zero.\n",
"        return 1.0\n",
"    intersection = float(sum(a & b))\n",
"    return intersection / union"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"seed = [1 0 0 1 0] other = [1 0 0 1 0] jaccard = 1.0\n",
"seed = [1 0 0 1 0] other = [0 0 1 0 0] jaccard = 0.0\n",
"seed = [1 0 0 1 0] other = [0 1 0 1 1] jaccard = 0.25\n",
"seed = [1 0 0 1 0] other = [1 0 1 1 0] jaccard = 0.666666666667\n"
]
}
],
"source": [
"# Compare every feature vector against the first one and print the result.\n",
"for features in features_list:\n",
"    # format() is applied to the string inside the print call, so the line\n",
"    # prints the same text under the Python 2 print statement and the\n",
"    # Python 3 print function.\n",
"    print(\"seed = {} other = {} jaccard = {}\".format(\n",
"        features_list[0], features, jaccard(features, features_list[0])))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment