Skip to content

Instantly share code, notes, and snippets.

@fabrizioc1
Created March 29, 2018 09:00
Show Gist options
  • Save fabrizioc1/ef231dbe280b3b25b341bc163f8c6ee2 to your computer and use it in GitHub Desktop.
Save fabrizioc1/ef231dbe280b3b25b341bc163f8c6ee2 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"import numpy as np\n",
"import functools\n",
"from collections import defaultdict, OrderedDict\n",
"\n",
"# Experiment parameters for the MinHash / LSH demo.\n",
"BANDS_COUNT = 4  # number of LSH bands; passed to get_lsh as b\n",
"FEATURES_COUNT = 10  # length of every generated feature vector\n",
"SAMPLES_COUNT = 200  # number of random feature vectors to generate\n",
"HASH_FUNCTIONS_COUNT = 50  # number of MinHash functions, i.e. the signature length\n",
"\n",
"P_VALUE = 15307  # modulus p of the hash family (a * row + b) % p\n",
"# Rows per band; passed to get_lsh as r. Floor division keeps this an int on\n",
"# Python 3 as well (50 // 4 == 12); true division would give 12.5, which cannot\n",
"# be used as a slice bound. Only the first BANDS_COUNT * R_VALUE == 48\n",
"# signature entries end up in a band.\n",
"R_VALUE = HASH_FUNCTIONS_COUNT // BANDS_COUNT\n",
"\n",
"# (1 / b) ** (1 / r): the usual approximation of the similarity threshold of\n",
"# the banding scheme (about 0.89 for b = 4, r = 12).\n",
"EXPECTED_JACCARD_SIMILARITY = (1.0 / BANDS_COUNT) ** (1.0 / R_VALUE)\n",
"\n",
"def generate_features():\n",
"    \"\"\"Return FEATURES_COUNT random bits (each 0 or 1) as a list of ints.\"\"\"\n",
"    bits = []\n",
"    for _ in range(FEATURES_COUNT):\n",
"        # Rounding a uniform draw from [0, 1) yields either 0 or 1.\n",
"        bits.append(int(round(random.random())))\n",
"    return bits\n",
"\n",
"def minhash(v, a, b, p):\n",
"    \"\"\"Return the MinHash of vector v under h(row) = (a * row + b) % p.\n",
"\n",
"    Every row index of v is hashed, and the smallest hash among the rows whose\n",
"    feature is truthy is returned. When v has no truthy feature the sentinel p\n",
"    is returned instead of letting min() raise ValueError on an empty list;\n",
"    for positive p it is larger than every real hash value, because those are\n",
"    reduced modulo p.\n",
"    \"\"\"\n",
"    # The builtin int replaces the np.int alias, which newer NumPy releases removed.\n",
"    row_numbers = np.arange(len(v), dtype=int)\n",
"    hash_values = (a * row_numbers + b) % p\n",
"    selected = [hash_value for hash_value, feature in zip(hash_values, v) if feature]\n",
"    return min(selected) if selected else p\n",
"\n",
"def get_lsh(sig, b, r):\n",
"    \"\"\"Return a list with one bucket hash per band for the signature sig.\n",
"\n",
"    sig is cut into b consecutive bands of r entries each; entries past\n",
"    b * r are ignored. The band number is hashed together with the band's\n",
"    values, so equal values in different bands are not forced into the same\n",
"    bucket.\n",
"    \"\"\"\n",
"    r = int(r)  # slice bounds must be ints even if r came from a true division\n",
"    lsh = []\n",
"    for band in range(b):\n",
"        band_values = tuple(sig[band * r:(band + 1) * r])\n",
"        lsh.append(hash((band, band_values)))\n",
"    return lsh\n",
"\n",
"# One (a, b) seed pair per hash function. randint excludes its upper bound, so\n",
"# the offsets b fall in [0, p - 1]. The multipliers a are redrawn from\n",
"# [1, p - 1], because a == 0 or a == p would hash every row to the same value.\n",
"hash_functions_seeds = np.random.randint(0, P_VALUE, size=(HASH_FUNCTIONS_COUNT, 2))\n",
"hash_functions_seeds[:, 0] = np.random.randint(1, P_VALUE, size=HASH_FUNCTIONS_COUNT)\n",
"hash_functions = [functools.partial(minhash, a=s[0], b=s[1], p=P_VALUE) for s in hash_functions_seeds]\n",
"\n",
"def create_signature(features):\n",
"    \"\"\"Apply every function in hash_functions to features and return the results as a list.\"\"\"\n",
"    signature = []\n",
"    for hash_function in hash_functions:\n",
"        signature.append(hash_function(features))\n",
"    return signature\n",
"\n",
"# Generate the samples, then compute their MinHash signatures and band hashes.\n",
"features = [generate_features() for i in range(SAMPLES_COUNT)]\n",
"signatures = [create_signature(item) for item in features]\n",
"lsh_values = [get_lsh(signature, BANDS_COUNT, R_VALUE) for signature in signatures]\n",
"\n",
"# Map every band hash to the indices of the samples that produced it.\n",
"lsh_groups = defaultdict(list)\n",
"for i, lsh_hashes in enumerate(lsh_values):\n",
"    for lsh_hash in lsh_hashes:\n",
"        lsh_groups[lsh_hash].append(i)"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" 2205600188479422073: array([ 6, 27, 33, 40, 46, 92, 103, 154, 193, 194])\n",
" -293373667914338174: array([ 6, 46, 52, 75, 78, 181])\n",
" 6433947695569687150: array([46, 60, 72, 75, 78, 90])\n",
"-8992488164808727881: array([ 7, 88, 147, 181, 184])\n",
"-1583294660279688411: array([ 5, 69, 94, 115, 187])\n",
" 1091462665699973464: array([ 53, 127, 142, 155])\n",
"-1192773111397545315: array([ 7, 40, 88, 194])\n",
" 579199865553961782: array([ 30, 61, 136, 161])\n",
" 5998665160725841151: array([115, 155, 189, 199])\n",
"-2351208631096763025: array([ 40, 131, 171, 194])\n",
" 3874079993700623940: array([46, 72, 75, 78])\n",
" 3513253610246951000: array([ 2, 93, 100])\n",
" 555045802841401439: array([104, 138, 150])\n",
" 3767034234313275501: array([ 22, 49, 181])\n",
"-7462136130136969084: array([ 22, 49, 166])\n",
" 7665830942910711954: array([ 40, 131, 194])\n",
"-8294236252542742205: array([ 73, 143, 192])\n",
" 6652489243394486880: array([ 27, 92, 184])\n",
" 839235362626277176: array([ 14, 68, 129])\n",
" 8677710645831685057: array([54, 60, 90])\n",
"-7232756772244003504: array([ 99, 109, 171])\n",
" 7355392977796261224: array([ 17, 47, 190])\n",
" 7841759836370251155: array([ 43, 157, 170])\n",
" 4114697109078320573: array([ 28, 89, 114])\n",
"-1202723475179721044: array([ 75, 78, 131])\n",
" 7566777267531495428: array([ 63, 164])\n",
"-8001745828403279851: array([104, 138])\n",
"-4493634870318057437: array([104, 138])\n",
"-3666662695838666708: array([25, 84])\n",
" 500201670097612864: array([ 1, 191])\n",
" 7744652499893289033: array([ 77, 117])\n",
"-1062239565333665711: array([20, 92])\n",
" 8684379579705602187: array([ 58, 182])\n",
"-4232570487896199021: array([ 23, 186])\n",
"-3289256988570965834: array([189, 199])\n",
"-9056557887510370113: array([42, 79])\n",
" 1356955176248561880: array([ 19, 160])\n",
" 3293369854524895458: array([114, 163])\n",
" 8375091059921178859: array([ 43, 142])\n",
"-3150870824033548019: array([128, 177])\n",
"-9047136102830589037: array([128, 177])\n",
" 7928695070452670767: array([109, 148])\n",
" 143034152770572600: array([ 39, 139])\n",
"-6377191291371392709: array([21, 31])\n",
" 4550284658373101896: array([36, 97])\n",
"-4658215217759586589: array([139, 165])\n",
"-2052662603517077074: array([42, 79])\n",
"-3076481155236376502: array([144, 159])\n",
"-9141194817926143533: array([42, 79])\n",
"-5240394749826909731: array([ 85, 123])\n",
" 4085755310669853155: array([ 63, 164])\n",
" 6733415682263964164: array([ 73, 143])\n",
"-4273136723810653686: array([ 19, 160])\n",
" 3504891779356877342: array([25, 48])\n",
" 3601542111197950524: array([115, 155])\n",
"-5281232378328231156: array([15, 57])\n",
" 386144237414396507: array([ 63, 164])\n",
" 475587000573575822: array([ 28, 114])\n",
"-3201212245962862820: array([ 27, 103])\n",
" 930662032798284461: array([ 33, 193])\n",
" 6666828971517188790: array([153, 195])\n",
" 7912903131138812614: array([100, 134])\n",
" 5418330124991193812: array([128, 177])\n",
"-3905098643006659861: array([104, 138])\n",
" 2980416597242446603: array([ 68, 129])\n",
" 8893003609165269813: array([87, 95])\n",
" 48310002981104439: array([128, 177])\n",
" 4710239764016286564: array([21, 31])\n",
" 3164545370979771245: array([87, 95])\n",
" -109894705126855826: array([ 49, 181])\n",
" 7426088305948634947: array([ 54, 182])\n",
" -589384610234927719: array([120, 152])\n",
"-7836774159975648332: array([21, 31])\n",
" 1229954841087990710: array([121, 197])\n",
" 2318742733614046183: array([ 44, 173])\n",
" -915885889521237007: array([ 30, 161])\n",
"-7828242008895237129: array([ 73, 143])\n",
" 234202921610187769: array([121, 197])\n",
" 6653417723990723583: array([36, 97])\n",
"-8815170311253279743: array([ 29, 195])\n",
" 2258524143000740901: array([105, 142])\n",
"-8327014272294194135: array([ 7, 88])\n",
" -82129828837317557: array([135, 152])\n",
"-5423690484305083308: array([16, 74])\n",
"-5623536378234924179: array([20, 92])\n",
"-5811225851769185069: array([ 63, 164])\n",
" 2751777353908538590: array([ 82, 174])\n",
" 4735341379683161327: array([36, 97])\n",
"-6962422245847638782: array([42, 79])\n",
" 58432453027945732: array([ 1, 191])\n",
"-2753579718058203869: array([ 68, 129])\n",
" 7148708469846930726: array([ 9, 37])\n",
"-8443591862526634700: array([127, 187])\n",
" 6426054618535267637: array([34, 38])\n",
" 5530200579228814651: array([115, 155])\n",
" 6655769329119425879: array([50, 65])\n",
"-5238270421593760411: array([ 19, 160])\n",
"-4794694382509599346: array([ 19, 160])\n",
" 2052974365789314447: array([ 20, 126])\n",
" 7804828260577234339: array([ 93, 100])\n",
" 4249737759287454654: array([144, 159])\n",
" 7365161804022537646: array([ 1, 191])\n",
"-6044889568500240944: array([ 26, 184])\n",
" 1748773958677861862: array([87, 95])\n",
"-1551582182362237460: array([ 6, 67])\n",
" 1019096898366686729: array([144, 159])\n",
" 7260188492159997472: array([ 11, 169])\n",
"-2075175704133841370: array([20, 72])\n",
" 3526974295576577603: array([ 66, 145])\n",
" 3967230403692721774: array([16, 91])\n",
" 1881566100384693872: array([ 30, 161])\n",
"-2768406792686254477: array([ 17, 190])\n",
"-2313833380990450054: array([157, 170])\n",
"-3945310022520822145: array([ 68, 129])\n",
"-2931558749729120620: array([144, 159])\n",
" 4903980762326118038: array([ 1, 191])\n",
"-7726270821643878756: array([60, 90])\n",
"-5984186889231780082: array([145, 192])\n",
"-6139815673108773130: array([121, 197])\n",
" 6401441519760711427: array([87, 95])\n",
"-2452329017771137255: array([21, 31])\n",
"-2534152896363364578: array([ 17, 190])\n",
" 417195753650030367: array([ 7, 88])\n",
"-2547720138243932384: array([ 99, 171])\n",
"-1937967631339722975: array([ 83, 119])\n",
" 5245751735376160568: array([50, 94])\n",
"-5201688883304915139: array([36, 97])\n",
"-4062596501856780650: array([ 30, 161])\n",
" 7212521011855077273: array([45, 60])\n",
" 3094875525283700635: array([ 27, 103])\n",
" 2990549145342107298: array([121, 197])\n",
"-7262195728430116913: array([ 94, 135])\n",
"-7261001916735635493: array([ 0, 162])\n",
" 6656383950029170688: array([153])\n",
" 124288732063491755: array([38])\n",
" 7125557924628856837: array([101])\n",
" 7568244693587707915: array([98])\n",
" 745994192663478290: array([25])\n",
"-9051975617844189164: array([167])\n",
"-4426052358267971555: array([22])\n",
"-2237114428101296094: array([147])\n",
"-3858738888899993564: array([198])\n",
"-6861004988124237781: array([125])\n",
" -132602197444896723: array([35])\n",
"-5033570728704487377: array([108])\n",
" 3449776831238815795: array([118])\n",
"-1865884574543527878: array([65])\n",
" 442079271442876478: array([62])\n",
" 4017856111385352258: array([135])\n",
" 3940649802140823619: array([85])\n",
"-2076667153503807414: array([74])\n",
"-8735431336053663664: array([156])\n",
" 3360476915320410195: array([56])\n",
" 6718747413344495700: array([170])\n",
"-5776793933286223782: array([166])\n",
"-7420700761337233312: array([132])\n",
" -183997577986824094: array([106])\n",
" 7907886545473218723: array([178])\n",
"-2264086742604622117: array([149])\n",
"-4621770206377455505: array([64])\n",
" 8356750601668149360: array([130])\n",
" 2240938441562450033: array([169])\n",
"-2890893318258470798: array([176])\n",
"-7482513904241659782: array([18])\n",
"-3692298279206174596: array([58])\n",
"-4328940047600357250: array([55])\n",
" 8440231523746394239: array([43])\n",
"-1728148994936874645: array([187])\n",
" 9013369159536828556: array([167])\n",
" 5841255163593928856: array([15])\n",
"-1016490533131626333: array([52])\n",
"-1473050683191181146: array([11])\n",
"-6445605819827339096: array([163])\n",
"-7157895394124142422: array([70])\n",
" 4181151439310229687: array([133])\n",
" 5058093861449447612: array([100])\n",
"-1799791339882698558: array([18])\n",
" 8946424683283619531: array([3])\n",
"-2336735132121984828: array([189])\n",
" 8225121759686627526: array([174])\n",
" 8430603485770592455: array([44])\n",
"-1369150210925780792: array([106])\n",
" 5976739807815327948: array([24])\n",
"-5647395210106080563: array([77])\n",
" 6669104386789227556: array([178])\n",
" -958430500537736998: array([190])\n",
" 3430295871403630820: array([174])\n",
" 5179467655960185066: array([116])\n",
"-7100313032081053460: array([150])\n",
"-1434585361362724622: array([86])\n",
" -415448199297552141: array([45])\n",
" 1707377051514836009: array([35])\n",
"-6230694415272469804: array([185])\n",
"-6311207724808634117: array([168])\n",
" 5893291976663525628: array([18])\n",
"-1129680498814093779: array([64])\n",
" -132709506453470975: array([195])\n",
"-7313963154409527036: array([140])\n",
" 7637876774259067158: array([10])\n",
"-3902238923003920096: array([120])\n",
" 2630395823147133219: array([131])\n",
" 3462808238835726630: array([66])\n",
"-6317311950366803673: array([32])\n",
" 6829219105907878185: array([13])\n",
" 8277243630643870002: array([73])\n",
" 3966317779912401207: array([162])\n",
" -37541616112631491: array([80])\n",
"-3886869675837830843: array([41])\n",
"-3673733359649115824: array([82])\n",
"-1335083021207858859: array([76])\n",
" 8643165270426153304: array([76])\n",
"-1199414188993495665: array([169])\n",
" 4417007871771529567: array([44])\n",
" 8822042201672911202: array([151])\n",
" -230747175354394268: array([15])\n",
" 164256269303681387: array([122])\n",
" 7472502300363413877: array([135])\n",
"-5188269969636431492: array([86])\n",
"-3147150646734233209: array([187])\n",
" 5385860683517059298: array([141])\n",
" 9108988157804938306: array([172])\n",
"-3180630757609383538: array([183])\n",
"-7854503421263134312: array([105])\n",
"-2067144570228213353: array([49])\n",
" -300042603680847464: array([103])\n",
" 5274025388623268249: array([99])\n",
"-8993202717039412837: array([156])\n",
" 7052512758585803169: array([80])\n",
"-7406484384523273817: array([199])\n",
" 2079199339091288492: array([178])\n",
"-2930607615694831179: array([3])\n",
" 8056244613007763899: array([134])\n",
"-1051882718282296245: array([147])\n",
"-6591694579880675669: array([179])\n",
" 4406815808565348809: array([4])\n",
" 5409101571668226507: array([175])\n",
"-6046458270531866155: array([89])\n",
" 250217051066921433: array([132])\n",
" 579677869784594910: array([140])\n",
"-2130220299120227871: array([25])\n",
"-9213396590073212446: array([139])\n",
" 6306465624251523556: array([81])\n",
" -137319441992863663: array([29])\n",
"-5541556113699698199: array([101])\n",
"-6874334058494983701: array([85])\n",
" 238791727852900852: array([151])\n",
" 8658135238641842678: array([101])\n",
"-7486290265052278281: array([39])\n",
"-1595448428420578823: array([55])\n",
" 617402301110309370: array([189])\n",
" 3375600277067272700: array([33])\n",
"-4862702292558291883: array([126])\n",
" 4594271836267069957: array([53])\n",
" 1283938094778192395: array([157])\n",
"-4321242349526064624: array([66])\n",
"-5472073222861870572: array([74])\n",
" 7743135235028226582: array([82])\n",
"-8037207126246186470: array([57])\n",
" 1125061874313681436: array([184])\n",
"-6501896472843443681: array([154])\n",
"-4770548666346108382: array([86])\n",
"-2365004445977976285: array([186])\n",
" 1035969431151376937: array([13])\n",
" 2665520474892108338: array([134])\n",
"-6258131871189345740: array([111])\n",
"-2338020462188580290: array([188])\n",
" 5353696595400198723: array([173])\n",
" 7632217757976062533: array([149])\n",
"-2502325116709299638: array([178])\n",
" 246000138573580879: array([59])\n",
" 3644946109046170193: array([28])\n",
" 9119666000450515541: array([13])\n",
"-1053387176121220513: array([126])\n",
" 4793775656197474918: array([136])\n",
"-3990743830500255127: array([62])\n",
" 2337161978436760171: array([61])\n",
" 653901389515543158: array([196])\n",
" 8037414413771803255: array([98])\n",
" 7320725589806369400: array([117])\n",
"-9099961661953471868: array([84])\n",
"-5608793285651769578: array([198])\n",
"-8860597611814849913: array([69])\n",
" 4966021376188336777: array([44])\n",
" 5378195388883352036: array([112])\n",
"-2957015676307146643: array([26])\n",
"-7999794637459277163: array([176])\n",
" 2664909748852542102: array([83])\n",
"-1640613327199853929: array([56])\n",
"-5158433823961890705: array([116])\n",
" 6940865484258228892: array([137])\n",
" 7836099322473341598: array([50])\n",
" 7193582558265781019: array([116])\n",
" 1667546571251788456: array([12])\n",
"-3945395870425375345: array([196])\n",
"-6656026884250580296: array([32])\n",
" -150543768567815052: array([172])\n",
" 4821018638228775612: array([47])\n",
" 270355158265449157: array([137])\n",
"-8238964857802988851: array([29])\n",
"-4241249964876623153: array([110])\n",
" 5187898721167395536: array([33])\n",
" 8819239847967789779: array([24])\n",
"-8139003207015392553: array([109])\n",
" -865414334132874533: array([37])\n",
" 3350007176694158047: array([35])\n",
"-6161089184082709782: array([2])\n",
" -37157879231177998: array([141])\n",
" -112483896246488332: array([130])\n",
" 5218403096130013952: array([23])\n",
" 4807866994571780873: array([55])\n",
"-4197387596037309686: array([16])\n",
"-3536494762183216372: array([12])\n",
" 6374317385946753806: array([139])\n",
"-8629975920907674835: array([152])\n",
" 1614343053169228561: array([146])\n",
" 1530772190531693332: array([126])\n",
" 2746477438028800789: array([112])\n",
" 4113446948448799512: array([195])\n",
"-3890323990575031525: array([70])\n",
" 3180147325119941405: array([23])\n",
"-8751670243967487198: array([54])\n",
" 333087456651919506: array([37])\n",
"-6184815462401268949: array([119])\n",
" -28648390352520403: array([45])\n",
" 2486186484910307805: array([105])\n",
" 5957120637512301363: array([67])\n",
"-7827334146838129865: array([89])\n",
" 8907504018088702785: array([141])\n",
"-3963400855918914282: array([158])\n",
" 7763809368974052165: array([101])\n",
" -569271407919830198: array([53])\n",
" 1600153771491390284: array([52])\n",
" 8035215495708752722: array([133])\n",
" 7905630588259496788: array([70])\n",
" 6441359575392605017: array([102])\n",
"-3403704569155910821: array([158])\n",
" 3519556878717217630: array([107])\n",
"-5669493754904896671: array([48])\n",
"-7878241400157510170: array([61])\n",
" 8984951764641667943: array([175])\n",
"-5944647206404478103: array([51])\n",
" 250217051003962218: array([151])\n",
" 8443398067577418215: array([167])\n",
"-5125357684153730192: array([0])\n",
" 1722922626157945713: array([112])\n",
"-4420247073406942350: array([26])\n",
"-7630058322384805004: array([186])\n",
"-8038499124743929656: array([183])\n",
"-4855331284019158144: array([171])\n",
" 6865380509904788353: array([24])\n",
"-3288251468621919765: array([84])\n",
" 725973669636912005: array([6])\n",
" -93067228570989687: array([199])\n",
" 5634509808248855435: array([180])\n",
" 6884988838148057998: array([123])\n",
"-8176164134134547557: array([71])\n",
" 5920239014280936348: array([162])\n",
" 855859477561899937: array([168])\n",
"-6854125407767616599: array([96])\n",
"-5970357783904910418: array([81])\n",
"-3039566883162389583: array([196])\n",
" 7720472991117031924: array([166])\n",
" 286003955833288482: array([71])\n",
"-1426947053973036098: array([108])\n",
"-8341482939780232256: array([108])\n",
" 4574858606481603522: array([145])\n",
"-4757924113398424628: array([45])\n",
"-5308555444409252912: array([2])\n",
"-5342454525477965661: array([109])\n",
" 1339591309858300887: array([34])\n",
" 8008575642901736955: array([67])\n",
"-1471527048220199960: array([188])\n",
" 546848063019748329: array([81])\n",
" 3904390807448433643: array([71])\n",
" 6438928080584483823: array([136])\n",
" 372439331778104309: array([173])\n",
"-1739053556711681537: array([146])\n",
" 8674914918190930947: array([9])\n",
"-2633195747646845944: array([170])\n",
" -639568396396370922: array([3])\n",
" 6663923492114713623: array([41])\n",
"-3945310018935962600: array([154])\n",
"-8789325158505769950: array([91])\n",
" 5144173326580546595: array([142])\n",
"-6194053870494743516: array([67])\n",
" 5247193473611752493: array([65])\n",
"-4765060469204194251: array([179])\n",
"-6720301567239343045: array([113])\n",
" 7092625020756618306: array([56])\n",
" 2060534815886095432: array([72])\n",
" 1873652293805800524: array([110])\n",
" 1046242848060617813: array([64])\n",
"-6793861894081844133: array([125])\n",
"-7867716218601796512: array([185])\n",
"-7823471018575321584: array([83])\n",
"-2141074356777450397: array([64])\n",
" 1703375105185426541: array([124])\n",
"-5885034433441000333: array([154])\n",
"-2178136760804889484: array([90])\n",
" 1926663213551864955: array([107])\n",
"-6583636023964498817: array([193])\n",
" 2719930769932831874: array([117])\n",
"-5064657388235033465: array([24])\n",
"-8141585468083166062: array([5])\n",
" 8591824546851900270: array([0])\n",
"-2029350506465436520: array([137])\n",
" -716187173832998374: array([133])\n",
"-3001771648294120282: array([116])\n",
"-2155589709559599961: array([124])\n",
" -7734207521893208: array([112])\n",
"-1832589236359639895: array([93])\n",
"-7160758790371681110: array([61])\n",
"-5956855780704653113: array([132])\n",
" 3855790383581213872: array([158])\n",
"-8135024449508051791: array([14])\n",
" 9112962236976043186: array([110])\n",
"-7578522677982032716: array([77])\n",
" 2720037140177958071: array([124])\n",
"-9070451507585774738: array([148])\n",
"-4867223236089227744: array([158])\n",
" 8615171685483859144: array([56])\n",
" 3374834073829534921: array([122])\n",
"-7619441719494961959: array([145])\n",
"-5674662932140528422: array([156])\n",
"-3480632094807185883: array([96])\n",
" 4499404417148474599: array([176])\n",
"-8576577865303429910: array([91])\n",
"-1530016287116245780: array([54])\n",
" 5521143955258539248: array([89])\n",
"-7811127819338001165: array([185])\n",
"-2060447784151507723: array([80])\n",
" 2657654997108202750: array([11])\n",
" 6637413547145524097: array([180])\n",
" 5349354633445213451: array([62])\n",
" 205452590132428046: array([136])\n",
"-2955246334984805105: array([9])\n",
"-4432910867332807122: array([151])\n",
"-6201492705344527082: array([8])\n",
" 3900243455927321883: array([120])\n",
"-3184044406973868767: array([113])\n",
" -506598636686774997: array([4])\n",
"-3257940141622878930: array([18])\n",
"-6324179141552646862: array([108])\n",
" 2733626712747889985: array([130])\n",
"-8469079076270625468: array([3])\n",
"-5655929388618328760: array([8])\n",
"-6111537904125004471: array([8])\n",
" 8950898979361635658: array([111])\n",
"-3268584335685431988: array([12])\n",
" 8738522800139330893: array([41])\n",
" 4342493052343416145: array([8])\n",
" 6787031755237848411: array([39])\n",
" 250217049015977308: array([150])\n",
" 2467694565535173986: array([82])\n",
"-7082555205082071709: array([183])\n",
"-5755883578289196484: array([150])\n",
"-4888427052387408534: array([182])\n",
" 8708891214205975916: array([165])\n",
" 3103394443266198894: array([157])\n",
"-6474759946447629421: array([105])\n",
"-4709800485218923144: array([34])\n",
"-5979165160281885317: array([119])\n",
" 8655538224932594922: array([132])\n",
"-2883492770340680322: array([48])\n",
" 4141786331112643967: array([84])\n",
"-4406390199689482877: array([186])\n",
" 4240683352046261641: array([34])\n",
" 7651163200713084298: array([39])\n",
" 323732618769952144: array([188])\n",
"-8220942672516258415: array([166])\n",
"-4349705370092126822: array([29])\n",
" -727510598820039266: array([117])\n",
" 6658426455281632672: array([16])\n",
" 2113629792565521825: array([153])\n",
"-7525965060916968030: array([51])\n",
" 3238063470345178549: array([106])\n",
" 3149732400129162375: array([183])\n",
"-5015258380638384703: array([123])\n",
" 5974626832071900630: array([26])\n",
" 704342580931694039: array([4])\n",
" 2085159436493278684: array([146])\n",
" 928116425838628317: array([180])\n",
"-8598934676102314521: array([71])\n",
"-4273602831328055825: array([9])\n",
"-7606126089011808783: array([32])\n",
" 1232980854946625017: array([111])\n",
" 2737574870118149626: array([141])\n",
" -416729781240277509: array([70])\n",
"-2481905073207003651: array([32])\n",
" 4500874289663364618: array([192])\n",
"-4514374511880931826: array([179])\n",
"-3815629446064126448: array([65])\n",
" 6073540604671071761: array([148])\n",
" 3678081373512594350: array([165])\n",
" 8436687494305844763: array([172])\n",
"-5015647589536420318: array([162])\n",
"-7163946651571352027: array([57])\n",
" 2351222169188646439: array([83])\n",
"-1714145197839661523: array([47])\n",
" 8811400566844315182: array([23])\n",
"-4297034911703548369: array([106])\n",
"-6183889904086036929: array([192])\n",
"-1834908564939920460: array([2])\n",
"-2517924025403793861: array([59])\n",
"-6170591281468692931: array([53])\n",
"-8573151303024382027: array([174])\n",
"-5408408320316400064: array([17])\n",
" 1098676450862657092: array([0])\n",
" 6351603501458767432: array([81])\n",
"-5124996556083360177: array([163])\n",
" 4796424721970333266: array([62])\n",
" 1233327882584815187: array([113])\n",
"-3273923508519504300: array([80])\n",
" 6230600771183775318: array([149])\n",
" 5903143007837576791: array([118])\n",
"-8347300743769885096: array([130])\n",
" 8506439248601492751: array([147])\n",
"-4218831102165866914: array([52])\n",
" 1122416928400047713: array([22])\n",
" 1307779299411856999: array([111])\n",
"-7224249093173823894: array([11])\n",
" 3868708993676244588: array([168])\n",
" 684434332482299503: array([149])\n",
" 3409779002937982581: array([118])\n",
" 1082880478641415798: array([69])\n",
"-1990052986571582528: array([122])\n",
" 5847613127090609794: array([94])\n",
" 7894592554338624107: array([167])\n",
" 2220488992868007556: array([76])\n",
" 3987083900477198614: array([10])\n",
"-2276971701855283575: array([55])\n",
"-8429417444664205680: array([96])\n",
"-8654975433403115878: array([119])\n",
" 7832821591241496222: array([163])\n",
"-2805159597266385246: array([107])\n",
"-1440922687003691349: array([14])\n",
" 4805360239149000369: array([198])\n",
"-1531106899555504462: array([91])\n",
" 7965226692649715381: array([51])\n",
" 8984178501645244086: array([102])\n",
" 7495020511740788341: array([140])\n",
" 1502664551022304960: array([10])\n",
" 4409711706368177857: array([185])\n",
" 3807254296485643971: array([140])\n",
" 6165471163463755461: array([74])\n",
" 361038015012916935: array([122])\n",
" 8773419424338743927: array([10])\n",
" 7907410630106562255: array([152])\n",
"-8086129370688182565: array([148])\n",
"-4027849311002034467: array([50])\n",
"-4491799614292369697: array([96])\n",
" 148429983513755363: array([114])\n",
" -558458340685400338: array([127])\n",
"-3299621932672450825: array([113])\n",
" 7456506835844417277: array([58])\n",
" -283606424844689660: array([118])\n",
" -709602441886839035: array([38])\n",
"-3862542896663423219: array([173])\n",
"-8510470872084054258: array([125])\n",
"-3525213422401892586: array([28])\n",
" 9172973469729574682: array([153])\n",
"-3722800266819981009: array([193])\n",
" -984650276611016932: array([35])\n",
" 2652967826372509474: array([13])\n",
"-8878549746098227413: array([5])\n",
"-5424667006326726865: array([196])\n",
" 7515263243628877619: array([14])\n",
"-1368854285588129993: array([198])\n",
"-3495741304698958018: array([168])\n",
"-3632432775130161341: array([51])\n",
" 4707688265664515910: array([134])\n",
" 3646964263675430881: array([4])\n",
" 4116324093852249928: array([58])\n",
" -780686685817962676: array([37])\n",
"-7406751718125951146: array([76])\n",
" 262037073526888281: array([156])\n",
"-3143498240690026657: array([176])\n",
" 5380100898334351200: array([12])\n",
" -134293980349472923: array([5])\n",
" 3043190066899812205: array([85])\n",
" 8235598071477190510: array([57])\n",
"-2729433327937675405: array([102])\n",
" 2152287022908166009: array([48])\n",
" 186891503684149120: array([143])\n",
" 791183963628633985: array([93])\n",
"-5123956445723390073: array([47])\n",
" 6890056110167396233: array([125])\n",
"-7219616124911841394: array([15])\n",
"-2828564175201341545: array([127])\n",
" 3033710795526897565: array([123])\n",
"-7026779527101685855: array([182])\n",
"-5720318730677616732: array([165])\n",
" 8093147758332426158: array([59])\n",
" 4238649073436579759: array([38])\n",
" 690183900078809008: array([98])\n",
"-6988302191411374157: array([99])\n",
" 63678250346071988: array([180])\n",
" 8985949513703303838: array([172])\n",
"-2348289424712001610: array([43])\n",
"-8417894611852292166: array([77])\n",
"-2177066172942925891: array([133])\n",
"-5883195022732671042: array([188])\n",
" 2634360864066817985: array([66])\n",
" 726679056062449606: array([86])\n",
"-5024203455956938809: array([175])\n",
"-4995423867204231214: array([110])\n",
" 8498636628145590227: array([98])\n",
"-4263942708411076651: array([179])\n",
"-1735819078863671335: array([59])\n",
" 3821059279430873424: array([41])\n",
" 4578714089656546647: array([102])\n",
"-3181093704174411805: array([69])\n",
" 8640762441285853157: array([124])\n",
" 1275063547491753961: array([175])\n",
" 4122369044275421168: array([120])\n",
" -137982345661284013: array([146])\n",
" 4268678903566027433: array([107])\n",
" 519413400269240312: array([137])\n",
" -669122388335052806: array([169])\n"
]
}
],
"source": [
"# Largest buckets first. dict.items() and a single-argument key function run on\n",
"# Python 2 and 3 alike; iteritems() and tuple-unpacking lambdas exist only in Python 2.\n",
"lsh_groups_sorted = sorted(lsh_groups.items(), key=lambda group: len(group[1]), reverse=True)\n",
"for (h, i) in lsh_groups_sorted:\n",
"    print(\"%20d: %r\" % (h, np.array(i)))"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[[1, 0, 1, 1, 1, 1, 1, 1, 1, 1],\n",
" [1, 0, 1, 1, 1, 1, 0, 0, 1, 1],\n",
" [1, 0, 1, 1, 0, 1, 0, 1, 1, 1],\n",
" [1, 1, 1, 1, 1, 1, 0, 1, 1, 0],\n",
" [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
" [1, 1, 1, 1, 1, 1, 0, 0, 1, 1],\n",
" [1, 0, 1, 1, 1, 1, 0, 1, 1, 1],\n",
" [1, 1, 1, 1, 0, 1, 0, 1, 1, 0],\n",
" [1, 0, 1, 1, 0, 1, 0, 0, 1, 1],\n",
" [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]]"
]
},
"execution_count": 125,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the feature vectors of the samples that fell into the first (largest) bucket.\n",
"[features[sample_index] for sample_index in lsh_groups_sorted[0][1]]"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [],
"source": [
"def jaccard(x, y):\n",
"    \"\"\"Return the Jaccard similarity of two feature vectors, rounded to 4 places.\n",
"\n",
"    Entries are interpreted as presence flags (any truthy value counts as set);\n",
"    x and y are expected to have the same length. Two vectors without any set\n",
"    entry are treated as identical and score 1.0 rather than raising\n",
"    ZeroDivisionError.\n",
"    \"\"\"\n",
"    a = np.array(x, dtype=bool)\n",
"    b = np.array(y, dtype=bool)\n",
"    union = float((a | b).sum())\n",
"    if union == 0:\n",
"        return 1.0\n",
"    intersection = float((a & b).sum())\n",
"    return round(intersection / union, 4)"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
"def calculate_similarities(rows):\n",
"    \"\"\"Return (similarity, a, b) for every ordered pair of rows, most similar first.\n",
"\n",
"    Each row is also paired with itself, and both (a, b) and (b, a) are included.\n",
"    \"\"\"\n",
"    sims = [(jaccard(a, b), a, b) for a in rows for b in rows]\n",
"    # Index the tuple instead of unpacking it in the lambda signature, which\n",
"    # is a syntax error on Python 3.\n",
"    sims.sort(key=lambda sim: sim[0], reverse=True)\n",
"    return sims"
]
},
{
"cell_type": "code",
"execution_count": 128,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0000: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1] [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"1.0000: [1, 0, 1, 1, 1, 1, 0, 0, 1, 1] [1, 0, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"1.0000: [1, 0, 1, 1, 0, 1, 0, 1, 1, 1] [1, 0, 1, 1, 0, 1, 0, 1, 1, 1]\n",
"1.0000: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"1.0000: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"1.0000: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"1.0000: [1, 1, 1, 1, 1, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"1.0000: [1, 0, 1, 1, 1, 1, 0, 1, 1, 1] [1, 0, 1, 1, 1, 1, 0, 1, 1, 1]\n",
"1.0000: [1, 1, 1, 1, 0, 1, 0, 1, 1, 0] [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]\n",
"1.0000: [1, 0, 1, 1, 0, 1, 0, 0, 1, 1] [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]\n",
"1.0000: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"1.0000: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.9000: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.9000: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.8889: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1] [1, 0, 1, 1, 1, 1, 0, 1, 1, 1]\n",
"0.8889: [1, 0, 1, 1, 1, 1, 0, 1, 1, 1] [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.8750: [1, 0, 1, 1, 1, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.8750: [1, 0, 1, 1, 1, 1, 0, 0, 1, 1] [1, 0, 1, 1, 1, 1, 0, 1, 1, 1]\n",
"0.8750: [1, 0, 1, 1, 0, 1, 0, 1, 1, 1] [1, 0, 1, 1, 1, 1, 0, 1, 1, 1]\n",
"0.8750: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]\n",
"0.8750: [1, 1, 1, 1, 1, 1, 0, 0, 1, 1] [1, 0, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.8750: [1, 0, 1, 1, 1, 1, 0, 1, 1, 1] [1, 0, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.8750: [1, 0, 1, 1, 1, 1, 0, 1, 1, 1] [1, 0, 1, 1, 0, 1, 0, 1, 1, 1]\n",
"0.8750: [1, 1, 1, 1, 0, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.8750: [1, 1, 1, 1, 0, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.8750: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]\n",
"0.8571: [1, 0, 1, 1, 1, 1, 0, 0, 1, 1] [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]\n",
"0.8571: [1, 0, 1, 1, 0, 1, 0, 1, 1, 1] [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]\n",
"0.8571: [1, 0, 1, 1, 0, 1, 0, 0, 1, 1] [1, 0, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.8571: [1, 0, 1, 1, 0, 1, 0, 0, 1, 1] [1, 0, 1, 1, 0, 1, 0, 1, 1, 1]\n",
"0.8000: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.8000: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.8000: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.8000: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 0, 1, 1, 1, 1, 0, 1, 1, 1]\n",
"0.8000: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.8000: [1, 1, 1, 1, 1, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.8000: [1, 0, 1, 1, 1, 1, 0, 1, 1, 1] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.8000: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.7778: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1] [1, 0, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.7778: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1] [1, 0, 1, 1, 0, 1, 0, 1, 1, 1]\n",
"0.7778: [1, 0, 1, 1, 1, 1, 0, 0, 1, 1] [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.7778: [1, 0, 1, 1, 0, 1, 0, 1, 1, 1] [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.7778: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.7778: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 0, 1, 1, 1, 1, 0, 1, 1, 1]\n",
"0.7778: [1, 1, 1, 1, 1, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.7778: [1, 1, 1, 1, 1, 1, 0, 0, 1, 1] [1, 0, 1, 1, 1, 1, 0, 1, 1, 1]\n",
"0.7778: [1, 1, 1, 1, 1, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.7778: [1, 0, 1, 1, 1, 1, 0, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.7778: [1, 0, 1, 1, 1, 1, 0, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.7778: [1, 0, 1, 1, 1, 1, 0, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.7778: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.7778: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 0, 1, 1, 1, 1, 0, 1, 1, 1]\n",
"0.7500: [1, 0, 1, 1, 1, 1, 0, 0, 1, 1] [1, 0, 1, 1, 0, 1, 0, 1, 1, 1]\n",
"0.7500: [1, 0, 1, 1, 0, 1, 0, 1, 1, 1] [1, 0, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.7500: [1, 0, 1, 1, 0, 1, 0, 1, 1, 1] [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]\n",
"0.7500: [1, 1, 1, 1, 1, 1, 0, 0, 1, 1] [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]\n",
"0.7500: [1, 0, 1, 1, 1, 1, 0, 1, 1, 1] [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]\n",
"0.7500: [1, 1, 1, 1, 0, 1, 0, 1, 1, 0] [1, 0, 1, 1, 0, 1, 0, 1, 1, 1]\n",
"0.7500: [1, 0, 1, 1, 0, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.7500: [1, 0, 1, 1, 0, 1, 0, 0, 1, 1] [1, 0, 1, 1, 1, 1, 0, 1, 1, 1]\n",
"0.7000: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.7000: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.7000: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.7000: [1, 0, 1, 1, 1, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.7000: [1, 0, 1, 1, 0, 1, 0, 1, 1, 1] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.7000: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.7000: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 0, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.7000: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 0, 1, 1, 0, 1, 0, 1, 1, 1]\n",
"0.7000: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]\n",
"0.7000: [1, 1, 1, 1, 1, 1, 0, 0, 1, 1] [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.7000: [1, 1, 1, 1, 0, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.7000: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.6667: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1] [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]\n",
"0.6667: [1, 0, 1, 1, 1, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.6667: [1, 0, 1, 1, 1, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.6667: [1, 0, 1, 1, 0, 1, 0, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.6667: [1, 0, 1, 1, 0, 1, 0, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.6667: [1, 0, 1, 1, 0, 1, 0, 1, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.6667: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 0, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.6667: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 0, 1, 1, 0, 1, 0, 1, 1, 1]\n",
"0.6667: [1, 1, 1, 1, 1, 1, 0, 0, 1, 1] [1, 0, 1, 1, 0, 1, 0, 1, 1, 1]\n",
"0.6667: [1, 1, 1, 1, 1, 1, 0, 0, 1, 1] [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]\n",
"0.6667: [1, 0, 1, 1, 1, 1, 0, 1, 1, 1] [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]\n",
"0.6667: [1, 1, 1, 1, 0, 1, 0, 1, 1, 0] [1, 1, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.6667: [1, 1, 1, 1, 0, 1, 0, 1, 1, 0] [1, 0, 1, 1, 1, 1, 0, 1, 1, 1]\n",
"0.6667: [1, 0, 1, 1, 0, 1, 0, 0, 1, 1] [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.6667: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 0, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.6667: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 0, 1, 1, 0, 1, 0, 1, 1, 1]\n",
"0.6250: [1, 1, 1, 1, 0, 1, 0, 1, 1, 0] [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]\n",
"0.6250: [1, 0, 1, 1, 0, 1, 0, 0, 1, 1] [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]\n",
"0.6000: [1, 0, 1, 1, 1, 1, 1, 1, 1, 1] [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]\n",
"0.6000: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]\n",
"0.6000: [1, 1, 1, 1, 0, 1, 0, 1, 1, 0] [1, 0, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.6000: [1, 0, 1, 1, 0, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]\n",
"0.5556: [1, 0, 1, 1, 1, 1, 0, 0, 1, 1] [1, 1, 1, 1, 0, 1, 0, 1, 1, 0]\n",
"0.5556: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]\n",
"0.5556: [1, 1, 1, 1, 0, 1, 0, 1, 1, 0] [1, 0, 1, 1, 1, 1, 0, 0, 1, 1]\n",
"0.5556: [1, 0, 1, 1, 0, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.5556: [1, 0, 1, 1, 0, 1, 0, 0, 1, 1] [1, 1, 1, 1, 1, 1, 0, 1, 1, 0]\n",
"0.5556: [1, 1, 1, 1, 1, 1, 0, 1, 1, 0] [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]\n"
]
}
],
"source": [
"for (j, a, b) in calculate_similarities([features[idx] for idx in lsh_groups_sorted[0][1]]):\n",
" print(\"%.4f: %r %r\" % (j, a, b))"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.8908987181403393"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"EXPECTED_JACCARD_SIMILARITY"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment