Created
October 6, 2018 18:00
-
-
Save DavidMStraub/3fa7c90307635f49d43f2f5e55ea3c8c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import glob\n", | |
"import os\n", | |
"\n", | |
"import Levenshtein\n", | |
"import matplotlib.pyplot as plt\n", | |
"\n", | |
"%matplotlib inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Load every ./*.txt word list (one word per line) into a dict keyed by the\n", | |
"# file name without its extension.\n", | |
"wordlists = {}\n", | |
"# sorted(): glob returns paths in arbitrary order; sorting keeps the dict (and\n", | |
"# therefore the plot legend) in the same order on every run.\n", | |
"fs = sorted(glob.glob('./*.txt'))\n", | |
"for path in fs:\n", | |
"    # basename + splitext keeps dots inside the stem ('./a.b.txt' -> 'a.b') and\n", | |
"    # works with either path separator, unlike splitting the whole path on '.'.\n", | |
"    name = os.path.splitext(os.path.basename(path))[0]\n", | |
"    # Explicit encoding so non-ASCII words load identically under any locale.\n", | |
"    # NOTE(review): assumes the word lists are UTF-8 -- confirm.\n", | |
"    with open(path, encoding='utf-8') as handle:\n", | |
"        wordlists[name] = handle.read().splitlines()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def pairwise_distances(wordlist):\n", | |
"    \"\"\"Return the Levenshtein distance of every ordered pair of distinct words.\n", | |
"\n", | |
"    Each unordered pair contributes twice, once as (w1, w2) and once as\n", | |
"    (w2, w1). Pairs whose two words compare equal are skipped, which also\n", | |
"    drops repeated entries of the same word.\n", | |
"    \"\"\"\n", | |
"    return [\n", | |
"        Levenshtein.distance(w1, w2)\n", | |
"        for w1 in wordlist\n", | |
"        for w2 in wordlist\n", | |
"        if w1 != w2\n", | |
"    ]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# The helper is deliberately not called `distances`: binding this dict to the\n", | |
"# helper's own name would make the cell raise TypeError when it is re-run.\n", | |
"distances = {name: pairwise_distances(words) for name, words in wordlists.items()}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD8CAYAAAB5Pm/hAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvAOZPmwAAGktJREFUeJzt3X90VPW57/H3QwgEEeFU1KUJlJwFC0gbwThBCspC6cFw+XVK8QDqKbnLmvoDlqfWcrjtbU9aWZZ2oXJEasXKxbYoIniqEfyFlxQVOCY0oGCUX8ZjikeotCgFFPC5f8yQG2AmTDKT2cnm81orKzPf2Xs/z0R8svPs73y3uTsiIhJeHYJOQEREWpcKvYhIyKnQi4iEnAq9iEjIqdCLiIScCr2ISMip0IuIhJwKvYhIyKnQi4iEXMegEwDo2bOn9+nTJ+g0RETalU2bNv3Z3S8403aBFnozGw+M79u3L9XV1UGmIiLS7pjZ+8lsF2jrxt0r3L2se/fuQaYhIhJq6tGLiIScCr2ISMi1iYux8Rw9epT6+nqOHDkSdCptWk5ODnl5eWRnZwedioi0UW220NfX19OtWzf69OmDmQWdTpvk7nz88cfU19eTn58fdDoi0ka12dbNkSNHOP/881Xkm2BmnH/++fqrR0SaFGihN7PxZrbowIEDiV7PcEbtj35GInImml4pIhJybbZHf6o+s1el9Xh1c8eecZsHHniAhx56iKKiIpYuXZrW+ABLliyhurqaBx98MO3HlvSqHTAw7vjAd2oznIlI87WbQh+EX/7ylzz//PMnXeg8duwYHTvqxyYi7UebvRgbtFtuuYXdu3czYcIEunfvTllZGaNHj+Zb3/oWx48f5/vf/z7FxcVceumlPPzwwwBUVlYycuRIJk+ezIABA7jhhhtwdwCqqqoYNmwYgwYNYsiQIXz66acA7Nmzh5KSEvr168esWbMCe78iEl46NU3gV7/6FS+88AJr167lwQcfpKKigtdee40uXbqwaNEiunfvTlVVFZ999hnDhw9n9OjRANTU1LBt2zYuueQShg8fzuuvv86QIUOYMmUKTz75JMXFxXzyySd06dIFgM2bN1NTU0Pnzp3p378/M2fOpFevXkG+dREJGRX6JE2YMKGhOL/00ku8+eabrFixAoADBw6wY8cOOnXqxJAhQ8jLywNg8ODB1NXV0b17dy6++GKKi4sBOO+88xqOO2rUKE5cjC4oKOD9999XoReRtFKhT1LXrl0bHrs7CxYs4Nprrz1pm8rKSjp37tzwPCsri2PHjuHuCadBxtteBJp/ATjRhIVkJh5IuKnQt8C1117LQw89xDXXXEN2djbbt28nNzc34fYDBgxgz549VFVVUVxczKefftrw14EEqDzOtN7y+J/paK6ERTfn+tPGCvN7x912eVoySazwscLTxt6a/lYrR5UgpL3Qm1kH4G7gPKDa3R9Lx3Hb0lnJt7/9berq6igqKsLdueCCC/j973+fcPtOnTrx5JNPMnPmTA4fPkyXLl1Ys2ZNBjMWkbNZUoXezBYD44C97v7VRuMlwL8DWcCv3X0uMBHIBfYD9WnPOIPq6uoAKC8vP2m8Q4cO3HPPPdxzzz0njY8cOZKRI0c2PG88P764uJiNGzeetH1paSmlpaUNz5977rm05C0i0liy0yuXACWNB8wsC1gIjAEKgGlmVgD0Bza4+53ArelLVUREWiKpQu/u64ieoTc2BNjp7rvd/XNgGdGz+XrgL7FtjqcrURERaZlUPjCVC3zQ6Hl9bOxp4FozWwCsS7SzmZWZWbWZVe/bty+FNEREpCmpXIyNN1/Q3f0QcNOZdnb3RcAigEgk4inkISIiTUjljL4eaPzJnjxgT3MOcKZlikVEJHWpnNFXAf3MLB/4EzAVOH2ScBPcvQKoiEQiN6eQh5xltJKkSPMkO73yCWAk0NPM6oF/c/dHzWwG8CLR6Z
WL3X1bc4Kb2XhgfN++fc+8cbwPt6QiTR+MSUZdXR3jxo1j69atVFdX85vf/IYHHngg7raVlZXMmzdPUy1FJG2SKvTuPi3B+GpgdUuDn41n9JFIhEgkEnQaInIWadO3EmwLfve73zFkyBAGDx7Md77zHY4fP865557LD3/4QwYNGsTQoUP56KOPANi1axdDhw6luLiYH//4x5x77rmnHa+yspJx48YB8Ic//IHBgwczePBgLrvssoaliw8ePBh3qePQKu8e/0tE0kK3EmxCbW0tTz75JK+//jqbN28mKyuLpUuX8re//Y2hQ4eyZcsWRowYwSOPPALAHXfcwR133EFVVRWXXHLJGY8/b948Fi5cyObNm3n11Vcb1r+pqalh/vz5vP322+zevZvXX3+9Vd+n/H+FjxXG/RJpz7SoWRNeeeUVNm3a1LC88OHDh7nwwgvp1KlTw1n55ZdfzssvvwzAhg0bGta8uf7667nrrruaPP7w4cO58847ueGGG5g0aVLD8sbxljq+8sorW+U9ng0SLzCW4UREAhJooW/WxdgAuDvTp0/nZz/72Unj8+bNa1h2OJWlhWfPns3YsWNZvXo1Q4cObVjoTEsXS1olaoMlWDVTwifQQt/WL8aOGjWKiRMn8t3vfpcLL7yQ/fv3N/TR4xk6dCgrV65kypQpLFu27IzH37VrF4WFhRQWFrJhwwbeeecdevTokc63INKqmrsGfqI2mJZHbl3tp3WTwemQJxQUFDBnzhxGjx7NF198QXZ2NgsXLky4/fz587nxxhu59957GTt2LGe69jB//nzWrl1LVlYWBQUFjBkzhg0bNqT7bYjIWa79FPqATJkyhSlTppw0dvDgwYbHkydPZvLkyQDk5uayceNGzIxly5Y1TKPs06cPW7duBU5eynjBggWnxWtqqWMRkZZQjz6NNm3axIwZM3B3evToweLFi4NOSUREPfp0uuqqq9iyZUvQaYiInCTQefQiItL6VOhFREJOSyCIiISclkAQEQm5djO9Mt3rjSTzAY1hw4axfv166urqWL9+Pddf3/Ry+81ZjlhEJFPUo2/C+vXrgWgBf/zxx5u1byQSUZEXkTZBhb4JJ5YZnj17Nq+++iqDBw/m/vvvp66ujquuuoqioiKKiooafiE01ng54jfeeINhw4Zx2WWXMWzYMN59910AlixZwqRJkygpKaFfv37MmjUrc29ORM4a+sBUEubOnXvSXZ8OHTrEyy+/TE5ODjt27GDatGlUV1cn3H/AgAGsW7eOjh07smbNGn7wgx+wcuVKADZv3kxNTQ2dO3emf//+zJw5k169eiU8lohIc+kDUy1w9OhRZsyY0bBG/fbt25vc/sCBA0yfPp0dO3ZgZhw9erThtVGjRjWsiVNQUMD777+vQi8iaaXWTQvcf//9XHTRRWzZsoXq6mo+//zzJrf/0Y9+xNVXX83WrVupqKjgyJEjDa9pSWIRaW0q9Eno1q3bScsTHzhwgIsvvpgOHTrw29/+luPHjze5/4EDB8jNzQWifXkRkUxqN9Mrg1yv+tJLL6Vjx44MGjSI0tJSbrvtNr75zW/y1FNPcfXVV9O1a9cm9581axbTp0/nvvvu45prrslQ1iIiUe2m0AfhxHLE2dnZvPLKKye99uabbzY8PnEHqkTLEX/ta187qY9/9913A1BaWkppaWnD+ImLvRKV6LMTyzOch0h7p9aNiEjIaa0bEZGQ01o3IiIhpx69nKw8wS/dBPfsrR0wMO74wHdq447Hu5l0XU5yqYlIy6hHLyIScir0IiIh125aN4laBC2VqLUgIhI2OqNvJVrKQETainZzRh+Uu+++m6VLl9KrVy969uzJ5Zdfzje+8Q1uv/129u3bxznnnMMjjzzCgAEDKC0t5Utf+hI1NTUUFRXRrVs33nvvPT788EO2b9/Offfdx8aNG3n++efJzc2loqKC7OxsfvrTn1JRUcHhw4cZNmwYDz/8MGbGyJEjueKKK1i7di
1//etfefTRR7nqqquC/pGISDujM/omVFdXs3LlSmpqanj66acbliIuKytjwYIFbNq0iXnz5nHbbbc17LN9+3bWrFnDvffeC8CuXbtYtWoVzzzzDDfeeCNXX301b731Fl26dGHVqugMlBkzZlBVVcXWrVs5fPjwSZ+QPXbsGG+88Qbz58/nJz/5SQbfvYiERdrP6M1sJHA3sA1Y5u6V6Y6RKa+99hoTJ06kS5cuAIwfP54jR46wfv16rrvuuobtPvvss4bH1113HVlZWQ3Px4wZQ3Z2NoWFhRw/fpySkhIACgsLqaurA2Dt2rX84he/4NChQ+zfv5+vfOUrjB8/HoBJkyYBcPnllzdsLyLSHEkVejNbDIwD9rr7VxuNlwD/DmQBv3b3uYADB4EcoD7tGWeQu5829sUXX9CjRw82b94cd59TFzg7sQxxhw4dyM7Oxswanh87dowjR45w2223UV1dTa9evSgvL4+7jLGWMBaRlkq2dbMEKGk8YGZZwEJgDFAATDOzAuBVdx8D/CvQrnsNV155ZcP68QcPHmTVqlWcc8455Ofn89RTTwHRXwZbtmxpcYwTRb1nz54cPHiQFStWpCV3EZETkjqjd/d1ZtbnlOEhwE533w1gZsuAie7+duz1vwCdSZMgpkMWFxczYcIEBg0axJe//GUikQjdu3dn6dKl3HrrrcyZM4ejR48ydepUBg0a1KIYPXr04Oabb6awsJA+ffpQXFyc5nchIme7VHr0ucAHjZ7XA1eY2STgWqAH8GCinc2sDCgD6N27dwpptK677rqL8vJyDh06xIgRI/je975Hfn4+L7zwwmnbnnpTkfLy8pOen1j2+NTX5syZw5w5c047XmVlZcPjnj17qkcvIi2SSqG3OGPu7k8DT59pZ3dfBCwCiEQipzfD24iysjLefvttjhw5wvTp0ykqKgo6JRGRZkml0NcDje9inQfsac4BzGw8ML5v374ppNG6Hn/88aBTEBFJSSqFvgroZ2b5wJ+AqcD1zTmAu1cAFZFI5OYErzfMUpH44s0MEglcolVQ81Nv08ZbARWgbu7YlI8dVknNujGzJ4ANQH8zqzezm9z9GDADeBGoBZa7+7bmBG/qxiM5OTl8/PHHKmRNcHc+/vhjcnK0zq+IJJbsrJtpCcZXA6tbGrypM/q8vDzq6+vZt29fSw9/VsjJySEvLy/oNESkDWuza91kZ2eTn58fdBoiIu1eoIW+PVyMFZH2rfCxwrjjb01/K8OZBEf3jBURCTmtXikiEnKBFvqmZt2IiEh6qHUjIhJyat2IiIScWjciIiEX6PTKMy2BIG1HoilqyzOch4g0n1o3IiIhp0IvIhJy6tGLiIScpleKiIScWjciIiGnQi8iEnJtdpliaX3x7tRTp3uYiISOLsaKiIScLsaKiIScevQiIiGnQi8iEnK6GNuWlSdoaZXrmoaIJE9n9CIiIacz+hCpHTAw7vjAd2oznImItCU6oxcRCTnNoxcRCTnNoxcRCTm1bkREQk6FXkQk5FToRURCToVeRCTkVOhFREJOhV5EJORU6EVEQq5VCr2ZdTWzTWY2rjWOLyIiyUuq0JvZYjPba2ZbTxkvMbN3zWynmc1u9NK/AsvTmaiIiLRMsmf0S4CSxgNmlgUsBMYABcA0Mysws68DbwMfpTFPERFpoaRWr3T3dWbW55ThIcBOd98NYGbLgInAuUBXosX/sJmtdvcvTj2mmZUBZQC9e/duaf4iInIGqSxTnAt80Oh5PXCFu88AMLNS4M/xijyAuy8CFgFEIhFPIQ8REWlCKoXe4ow1FGx3X3LGA5iNB8b37ds3hTRERKQpqcy6qQd6NXqeB+xpzgG0eqWISOtLpdBXAf3MLN/MOgFTgWfTk5aIiKRLstMrnwA2AP3NrN7MbnL3Y8AM4EWgFlju7tuaE1w3HhERaX3JzrqZlmB8NbC6pcHdvQKoiEQiN7f0GCIi0jTdSlBEJOR0K0ERkZDTomYiIiGn1o
2ISMipdSMiEnJq3YiIhFwqSyCkTEsgtEzhY4Vxx7UutIjEE2ih1zx6EWkP+sxeFXe8bu7YDGfSMmrdiIiEnAq9iEjIaXqliEjIqUffBiTs/+VkOBERCSW1bkREQk6FXkQk5FToRURCToVeRCTkNOtGRCTktKiZiEjIqXUjIhJyKvQiIiGnQi8iEnIq9CIiIadCLyIScppeKSIScppeKSIScmrdiIiEnAq9iEjIBboefbtSnqC9VB7/+kLtgIFxxwe+U5uujEREkqIzehGRkFOhFxEJORV6EZGQU6EXEQm5tBd6MxtoZr8ysxVmdmu6jy8iIs2TVKE3s8VmttfMtp4yXmJm75rZTjObDeDute5+C/BPQCT9KYuISHMke0a/BChpPGBmWcBCYAxQAEwzs4LYaxOA14BX0papiIi0SFKF3t3XAftPGR4C7HT33e7+ObAMmBjb/ll3HwbckM5kRUSk+VL5wFQu8EGj5/XAFWY2EpgEdAZWJ9rZzMqAMoDevXunkIaIiDQllUJvccbc3SuByjPt7O6LgEUAkUjEU8hDRESakMqsm3qgV6PnecCe5hxAyxSLiLS+VAp9FdDPzPLNrBMwFXi2OQfQMsUiIq0v2emVTwAbgP5mVm9mN7n7MWAG8CJQCyx3923NCa4zehGR1pdUj97dpyUYX00TF1yTOG4FUBGJRG5u6TFERKRpupWgiEjI6VaCIiIhp0XNRERCTq0bEZGQU+tGRCTk1LoREQk5FXoRkZBTj15EJOTUoxcRCTm1bkREQk6FXkQk5NSjFxEJOfXoRURCTq0bEZGQU6EXEQk5FXoRkZDTxVgRkZDTxVgRkZBT60ZEJORU6EVEQk6FXkQk5FToRURCToVeRCTkNL1SRCTkNL1SRCTk1LoREQk5FXoRkZBToRcRCTkVehGRkFOhFxEJORV6EZGQU6EXEQm5Vin0ZvaPZvaImT1jZqNbI4aIiCQn6UJvZovNbK+ZbT1lvMTM3jWznWY2G8Ddf+/uNwOlwJS0ZiwiIs3SnDP6JUBJ4wEzywIWAmOAAmCamRU02uR/x14XEZGAJF3o3X0dsP+U4SHATnff7e6fA8uAiRb1c+B5d/9j+tIVEZHmSrVHnwt80Oh5fWxsJvB1YLKZ3RJvRzMrM7NqM6vet29fimmIiEgiHVPc3+KMubs/ADzQ1I7uvghYBBCJRDzFPEREJIFUz+jrgV6NnucBe5LdWcsUi4i0vlQLfRXQz8zyzawTMBV4NtmdtUyxiEjra870yieADUB/M6s3s5vc/RgwA3gRqAWWu/u2ZhxTZ/QiIq0s6R69u09LML4aWN2S4O5eAVREIpGbW7K/iIicmZZAEBEJOd0zVkQk5HTPWBGRkFPrRkQk5NS6EREJObVuRERCTq0bEZGQU+tGRCTk1LoREQk5tW5EREJOhV5EJOTUoxcRCblUbzySkjAsalb4WGHc8eUZzkNEJBG1bkREQk6FXkQk5FToRURCThdjRURCTh+YEhEJObVuRERCToVeRCTkVOhFREJOhV5EJOTM3YPOATP7FHg3wBR6An8OMH5byOFsj98Wcjjb47eFHNpb/C+7+wVn2ijQJRAaedfdI0EFN7PqIOO3hRzO9vhtIYezPX5byCGs8dW6EREJORV6EZGQayuFftFZHh+Cz+Fsjw/B53C2x4fgcwhl/DZxMVZERFpPWzmjFxGRVhJ4oTezEjN718x2mtnsDMdebGZ7zWxrJuM2it/LzNaaWa2ZbTOzOwLIIcfM3jCzLbEcfpLpHGJ5ZJlZjZk9F0DsOjN7y8w2m1l1puPHcuhhZivM7J3Yv4evZTB2/9h7P/H1iZn9S6bix3L4buzf31Yze8LMcjIZP5bDHbH42zLx/uPVHzP7kpm9bGY7Yt//Li3B3D2wLyAL2AX8PdAJ2AIUZDD+CKAI2BrQ+78YKIo97gZsz+T7j8U14NzY42zgP4GhAfws7gQeB54LIHYd0DOIfwONcngM+HbscSegR0
B5ZAH/TXR+dqZi5gLvAV1iz5cDpRl+318FtgLnEJ12vgbo18oxT6s/wC+A2bHHs4GfpyNW0Gf0Q4Cd7r7b3T8HlgETMxXc3dcB+zMVL078D939j7HHnwK1RP/RZzIHd/eDsafZsa+MXrgxszxgLPDrTMZtK8zsPKL/0z8K4O6fu/tfA0pnFLDL3d/PcNyOQBcz60i02O7JcPyBwEZ3P+Tux4A/AN9ozYAJ6s9Eor/0iX3/x3TECrrQ5wIfNHpeT4YLXVthZn2Ay4ieUWc6dpaZbQb2Ai+7e6ZzmA/MAr7IcNwTHHjJzDaZWVkA8f8e2Af8n1j76tdm1jWAPACmAk9kMqC7/wmYB/wX8CFwwN1fymQORM/mR5jZ+WZ2DvA/gF4ZzgHgInf/EKIngsCF6Tho0IXe4oydddOAzOxcYCXwL+7+Sabju/txdx8M5AFDzOyrmYptZuOAve6+KVMx4xju7kXAGOB2MxuR4fgdif4J/5C7Xwb8jeif7RllZp2ACcBTGY77d0TPZPOBS4CuZnZjJnNw91rg58DLwAtE28jHMplDawq60Ndz8m/NPDL/J1ugzCybaJFf6u5PB5lLrF1QCZRkMOxwYIKZ1RFt3V1jZr/LYHzcfU/s+17gP4i2FDOpHqhv9JfUCqKFP9PGAH90948yHPfrwHvuvs/djwJPA8MynAPu/qi7F7n7CKItlR2ZzgH4yMwuBoh935uOgwZd6KuAfmaWHzubmAo8G3BOGWNmRrQvW+vu9wWUwwVm1iP2uAvR/+neyVR8d/9f7p7n7n2I/vf/v+6esbM5M+tqZt1OPAZGE/0zPmPc/b+BD8ysf2xoFPB2JnOImUaG2zYx/wUMNbNzYv9PjCJ6vSqjzOzC2PfewCSC+Vk8C0yPPZ4OPJOOgwa6qJm7HzOzGcCLRK/2L3b3bZmKb2ZPACOBnmZWD/ybuz+aqfhEz2b/GXgr1iMH+IG7r85gDhcDj5lZFtFf/MvdPeNTHAN0EfAf0fpCR+Bxd38hgDxmAktjJzy7gf+ZyeCxvvQ/AN/JZFwAd/9PM1sB/JFou6SGYD6hutLMzgeOAre7+19aM1i8+gPMBZab2U1EfwFel5ZYsWk8IiISUkG3bkREpJWp0IuIhJwKvYhIyKnQi4iEnAq9iEjIqdCLiIScCr2ISMip0IuIhNz/A+T2o7v7IwhtAAAAAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<matplotlib.figure.Figure at 0x7f17e654bc50>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"# Materialise names and samples together so each histogram series is paired\n", | |
"# with its own legend label (a raw dict view is not a reliable `label` value).\n", | |
"names = list(distances)\n", | |
"samples = [distances[name] for name in names]\n", | |
"\n", | |
"# Distances are integers: use one unit-wide bin centred on each value so the\n", | |
"# bars line up with the integer ticks instead of 10 arbitrary-width bins.\n", | |
"max_distance = max((max(sample) for sample in samples if sample), default=0)\n", | |
"bin_edges = [d - 0.5 for d in range(max_distance + 2)]\n", | |
"\n", | |
"fig, ax = plt.subplots()\n", | |
"ax.hist(samples, bins=bin_edges, label=names)\n", | |
"ax.set_xticks(range(max_distance + 1))\n", | |
"ax.set_yscale('log')  # counts span several orders of magnitude\n", | |
"ax.set_xlabel('Levenshtein distance')\n", | |
"ax.set_ylabel('Number of word pairs')\n", | |
"ax.legend();" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment