## Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

# maxberggren/christophers_text_len.ipynb

Created May 25, 2016 16:28
Show Gist options
• Save maxberggren/6a77f2996bfe013adc9c68125c3aab5d to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
 { "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ "import scipy\n", "import pymc as pm\n", "import numpy as np\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "collapsed": false }, "outputs": [], "source": [ "### Christophers data\n", "men = pd.read_csv(\"http://genuskollen.se/men.csv\")[' freq'].values\n", "women = pd.read_csv(\"http://genuskollen.se/kvinnor.csv\")[' freq'].values\n", "\n", "### Put data in 20 equal sized bins\n", "n_bins = 20\n", "outlier_thres_r = np.percentile(men, 97.5) # Capture 97.5 % of data by cutting after this threshold\n", "outlier_thres_l = np.percentile(men, 2.5) # Thres that throws 2.5 % away\n", "bins = np.arange(outlier_thres_l, outlier_thres_r, outlier_thres_r/float(n_bins))\n", "men_binned, men_bins = np.histogram(men, bins=bins)\n", "women_binned, women_bins = np.histogram(women, bins=men_bins)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEACAYAAABbMHZzAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFHxJREFUeJzt3X/sXXd93/HnC4esS0L4mSZgjIyKVRJ1mTNNHmoKtSsa\nGbbVTJuaukXgkqaR1lA0Os3ijyXemFbGuglV2VKXegplqKHdEmRNRElQbQgMkrjKry12Gy91iZ0f\nmARok6bFad77454vOb75fr/3+nvv/d77vef5kK58z+ecz/l+7tHx637u5/xKVSFJ6pZXTLsBkqTV\nZ/hLUgcZ/pLUQYa/JHWQ4S9JHWT4S1IHDQz/JNuTHEnySJLdi8zfkeSBJPcl+aMkPzVsXUnSdGS5\n8/yTrAP+GHg3cAK4F9hZVYdby5xbVc817/8OcGtVvW2YupKk6RjU898CHK2qY1V1CrgZ2NFeYCH4\nG+cB3x62riRpOgaF/3rgsdb08absNEnel+QwcBvwq2dSV5K0+gaF/1D3fqiqL1TVxcA/Bj6bJCO3\nTJI0MWcNmH8C2NCa3kCvB7+oqroryVnA65rlBtZN4s2FJGkFqmrFHe1B4X8I2JRkI/A4cCWws71A\nkh8BHq2qSvL3mgY9neR7g+qO4wPodEn2VNWeabdjXrg9x8dtOV6jdpyXDf+qeiHJtcDtwDpgX1Ud\nTnJNM38v8E+BDyQ5BTwL/NxydUdprCRpPAb1/Kmq2+gdyG2X7W29/yTwyWHrSpKmzyt858/BaTdg\nzhycdgPmyMFpN0AvWfYir1VpQFKO+UvSmRk1OwcO+0jSIJ61N1mT6CAb/pLGwl/wkzGpL1bH/CWp\ngwx/Seogw1+SOsjwl6QOMvwlTUSSmvRriDYcS/LXSV7fV35fkheTvGVyW2C2ebaPpIk5cGBy6962\nbajFCniU3n3FboAfPHTqbzPkXYvnlT1/SfPuvwMfaE1/EPhdIABJ/laS30jyZ0meTHJjkh9q5m1N\ncjzJR5M8leTxJLtW/RNMgOEvad59Azg/ydubx8teSe8LAXpfAJ8A3gb83ebf9cB1rfoXAucDbwKu\nAv5LklevUtsnxvCX1AWfpdf7/2ngYXrPKoFe+F8NfLSqvltVzwK/TnN34sYp4N9W1d80N6t8FvjR\nVWv5hDjmL2neFb3wvwt4K60hH+AC4Bzgj1oPIAynd4yfrqoXW9N/Se955Wua4S9p7lXVN5M8CrwH\n+FBr1reB54FLquqJqTRuShz2kdQVVwE/VVXPt8peBD4NfCrJBQBJ1ie5YhoNXE32/CVNzJCnY66K\nqnq0v6h57aZ3gPcbSd5A73jAfwXuaC03d2byfv6LXbzhHQOl2eVzOSZnqW07t/fzb6e/e5QkjZdj\n/pLUQYa/JHWQ4S9JHWT4S1IHGf6S1EGGvyR1kOEvSR1k+EtSBxn+kiZiRh7j+LEkX+wre2SJsp8d\n9zaYZQPDP8n2JEeajbN7kfm/kOSBJA8m+VqSS1vzjjXl9yW5Z9yNlzTbaoKvIX0Z+PE092tO8kZ6\ndzbYnOQVrbIfAb4ywkddc5YN/+apNzcA24FLgJ1JLu5b7FHgXVV1KfBx4Ldb8wrYWlWXVdWW8TVb\nkoZyCHglsLmZfidwAPiTvrKjQJLsT/J009n9pYWVJNmT5A+SfDbJnzed2k3NL4unmkdA/nRr+Vcn\n2dc89vF4ko+3vmx2Jflqkv+Y5JkkjybZvgrb4jSDev5bgKNVdayqTgE3AzvaC1TV16vqe83k3cCb\n+9bhrXkkTUVVfZ9eLv1kU/Queg91+Wrzvl32eeCbwBuBfwb8+yTt+5L+I3oPgnktcB9wZ1P+Jnod\n372tZW8Cvk/vF8VlwBXAL7XmbwGOAK8HPgnsG+mDrsCg8F8PPNaaPt6ULeUqoD2WVsCXkhxKcvXK\nmihJI/kyLwX9T9Ab3rmrr+zLwI8Du6vq+1X1APA7nP7g969U1Z1V9TfA/6AX3J9opj8PbExyfpIL\n6T005l9U1fNVdRL4FKc/GvLPqmpf9W6r/LvAG5P88Pg/+tIG3dVz6KG15hvyQ8DlreLLq+qJ5iEJ\ndyY5UlV3raCdkrRSXwF+JclrgQuq6v8lOQl8pin7MXq98Geq6rlWvW8Cf781/a3W++eBb9dL98Rf\neEDMefRGP14JPNF6NOQrmvUteHLhTVX9ZbPceX1/Y6IGhf8JYENregO93v9pmoO8nwa2V9V3FsoX\nHotWVSeT3Ervp87Lwj/JntbkwSHbLknD+AbwanoPav8aQFX9eZLHgV+ml3OPA69Lcl7zEHeAt7BI\n3g3hMeCvgdf3Pft3JEm2AlvHtb5B4X8I2JRkI72NcyWws69BbwFuAd5fVUdb5ecA66rqL5KcS2/M\n698s9keqak/fOs/oQ0jSUqrq+SSHgI8C/64166tN2R1VdTzJ/wZ+Pcm/BH6U3kjGz6/g7z2R5A7g\nPyf518Bz9B4cv76qVnxGUVUdpNU5TnL9StcFA8b8q+oF4FrgduBh4PNVdTjJNUmuaRa7jt4BkBv7\nTum8CLgryf30Drj8r6q6A0mdkQm+ztCXgQvoBf6Cu4A38NIpnjuBjfQ6urcA11XVHzbzFjvDdLnp\nDwBn08vNZ4A/oJeJw65r4mb2MY79T/LyEXHS7PIxjpMzqcc4eoWvJHWQ4S9JHWT4S1IHGf6S1EGG\nvyR1kOEvSR006CIvSRrKMPfX1+ww/CWNzHP81x6HfSSpgwx/Seogw1+SOsjwl6QOMvwlqYMMf0nq\nIMNfkjrI8JekDjL8JamDDH9J6iDDX5I6yPCXpA4y/CWpgwx/Seogw1+SOsjwl6QOMvwlqYMMf0nq\nIMNfkjrI8JekDpqZB7gnqWm3QZK6YmDPP8n2JEeSPJJk9yLzfyHJA0keTPK1JJcOW7ffgQO9lyRp\nspYN/yTrgBuA7cAlwM4kF/ct9ijwrqq6FPg48NtnUFeSNAWDev5bgKNVdayqTgE3AzvaC1TV16vq\ne83k3cCbh60rSZqOQeG/HnisNX28KVvKVcAXV1hXkrRKBh3wHfogbJJtwIeAy1dQdw/ATTfB5s3D\n1pKk7kiyFdg6rvUNCv8TwIbW9AZ6Pfj+Rl0KfBrYXlXfOZO6AFW1J8n1u3YN2WpJ6piqOggcXJhO\ncv0o6xs07HMI2JRkY5KzgSuB/e0FkrwFuAV4f1UdPZO6kqTpWLbnX1UvJLkWuB1YB+yrqsNJrmnm\n7wWuA14L3JgE4FRVbVmq7gQ/iyRpSKma7rVVSaqqkqQWzvHftu30AwYBqipnut729JnWl6RZtpCd\nK60/17d3KM7gqLMkdchch78kaXEzc2+fUXlvIEka3tyEP5x+X6Bt26bXDkmadQ77SFIHGf6S1EGG\nvyR1kOEvSR1k+EtSBxn+ktRBhr8kdZDhL0kdZPhLUgcZ/pLUQYa/JHWQ4S9JHWT4S1IHGf6S1EGG\nvyR1kOEvSR1k+EtSBxn+ktRBhr8kdZDhL0kdZPhLUgcZ/pLUQYa/JHWQ4S9JHTQw/JNsT3IkySNJ\ndi8y/+1Jvp7kr5L8Wt+8Y0keTHJfknvG2XBJ0sqdtdzMJOuAG4B3AyeAe5Psr6rDrcWeBj4MvG+R\nVRSwtaqeGVN7JUljMKjnvwU4WlXHquoUcDOwo71AVZ2sqkPAqSXWkdGbKUkap0Hhvx54rDV9vCkb\nVgFfSnIoydVn2jhJ0mQsO+xDL7xHcXlVPZHkAuDOJEeq6q7+hZLsAbjpJti8ecS/KElzKMlWYOu4\n1jco/E8AG1rTG+j1/odSVU80/55Mciu9YaSXhX9V7Uly/a5dw65Zkrqlqg4CBxemk1w/yvoGDfsc\nAjYl2ZjkbOBKYP8Sy542tp/knCSvat6fC1wBPDRKYyVJ47Fsz7+qXkhyLXA7sA7YV1WHk1zTzN+b\n5CLgXuB84MUkHwEuAX4YuCXJwt/5XFXdMbmPIkka1qBhH6rqNuC2vrK9rfdPcvrQ0IJnAUfwJWkG\nDQz/Lkly2gHuqvI0VUlzyds79DlwoPeSpHlm+EtSBznsswyHgSTNK8N/Ge3kN/UlzROHfSSpgwx/\nSeogw1+SOsjwl6QOMvwlqYMMf0nqIMNfkjrI8JekDjL8JamDDH9J6iDDX5I6yPCXpA4y/CWpgwx/\nSeogw1+SOsjwl6QOMvwlqYMMf0nqIMNfkjrI8JekDjL8JamDDH9J6iDDX5I6aGD4J9me5EiSR5Ls\nXmT+25N8PclfJfm1M6krSZqOZcM/yTrgBmA7cAmwM8nFfYs9DXwY+I0V1J0rSar/Ne02SdJizhow\nfwtwtKqOASS5GdgBHF5YoKpOAieT/MMzrTuPDhx46f22bdNrhyQtZ9Cwz3rgsdb08aZsGKPUlSRN\n0KCe/yjDFkPXTbIH4KabYPPmEf6iJM2pJFuBreNa36DwPwFsaE1voNeDH8bQdatqT5Lrd+0acs2S\n1DFVdRA4uDCd5PpR1jdo2OcQsCnJxiRnA1cC+5dYNiPUlSStomV7/lX1QpJrgduBdcC+qjqc5Jpm\n/t4kFwH3AucDLyb5CHBJVT27WN1JfhhJ0nAGDftQVbcBt/WV7W29f5LTh3eWrStJmj6v8JWkDhrY\n89do2hd6VVX/cRFJmgp7/hNWjHa+rCRNguEvSR1k+EtSBxn+ktRBhr8kdZDhL0kdZPhLUgd5nv+M\n6X8AjNcGSJoEe/4z6MCB0x8KI0njZvhLUgcZ/pLUQYa/JHWQB3xnnAeAJU2C4T/j2slv6ksaF4d9\nJKmDDH9J6iDDX5I6yPCXpA4y/CWpgwx/Seogw1+SOsjwl6QO8iKvOdN/RTB4VbCklzP851D7dtDb\ntk2vHZJml8M+ktRBhr8kddDA8E+yPcmRJI8k2b3EMr/ZzH8gyWWt8mNJHkxyX5J7xtlwSdLKLTvm\nn2QdcAPwbuAEcG+S/VV1uLXMe4G3VdWmJP8AuBF4RzO7gK1V9cxEWi9JWpFBPf8twNGqOlZVp4Cb\ngR19y/wM8BmAqrobeE2SC1vzPdNEkmbMoPBfDzzWmj7elA27TAFfSnIoydWjNFSSND6DTvV82Tnj\nS1iqd/8TVfV4kguAO5Mcqaq7hm+eJGkSBoX/CWBDa3oDvZ79csu8uSmjqh5v/j2Z5FZ6w0gvC/8k\newBuugk2bx6+8RpO+8IvL/iS1qYkW4Gt41rfoGGfQ8CmJBuTnA1cCezvW2Y/8IGmce8AvltVTyU5\nJ8mrmvJzgSuAhxb7I1W1B2DXLsN/Eorhf8JJmk1VdbCq9iy8Rl3fsj3/qnohybXA7cA6YF9VHU5y\nTTN/b1V9Mcl7kxwFngN+sal+EXBLkoW/87mqumPUBkuSRjfw9g5VdRtwW1/Z3r7paxep9yhgP16S\nZpBX+EpSB3ljN71M/51BPUgszR97/lrUgQOn3x1U0nwx/CWpgxz20UAOA0nzx/DXQO3kN/Wl+WD4\na+x8lKQ0+wx/TYSPkpRmmwd8JamDDH9J6iCHfbQqvLOoNFvs+WtVeGdRabYY/pLUQQ77aCZ5YZk0\nWfb8NbO8v5A0Ofb8tSZ4wFgaL3v+WhM8YCyNl+EvSR3ksI/mkvcXkpZn+Gtu9d9f6EyPG3jGkeaZ\nwz7qjJUcN/CMI80rw1+SOshhH2lIow4bDVtvuXU49KRxMfylIS2k8Jmk76jHHdrr8LkIGieHfaRV\n5PUKmhX2/KU55imvWorhL60howwbwcqGjjzuMJ8Mf2kNWclxh3EY5biDvz5m08Ax/yTbkxxJ8kiS\n3Uss85vN/AeSXHYmdSWtriS18Fqtv7lwvcQ4rplot381P8O8WTb8k6wDbgC2A5cAO5Nc3LfMe4G3\nVdUm4JeBG4etK826+++fdgvGb9SDzisN3va2PNN19C9/pp+h/wvDL5DBPf8twNGqOlZVp4CbgR19\ny/wM8BmAqrobeE2Si4asK820eQz/Ua30y6O9LadxtXX/r49Rv0BW3pLZMCj81wOPtaaPN2XDLPOm\nIepK0prR/vJY618Eg8J/2A/lwRtJnbLWr9lI1dLNT/IOYE9VbW+mPwa8WFX/obXMbwEHq+rmZvoI\n8JPAWwfVbcrX8vaTpKkZ5aypQad6HgI2JdkIPA5cCezsW2Y/cC1wc/Nl8d2qeirJ00PU9ZQvSZqC\nZcO/ql5Ici1wO7AO2FdVh5Nc08zfW1VfTPLeJEeB54BfXK7uJD+MJGk4yw77SJLm01Rv7OZFYKNJ\ncizJg0nuS3JPU/a6JHcm+ZMkdyR5zbTbOauS/LckTyV5qFW25PZL8rFmXz2S5IrptHp2LbE99yQ5\n3uyj9yV5T2ue23MJSTYkOZDk/yb5P0l+tSkf3/5ZVVN50RsKOgpsBF4J3A9cPK32rMUX8KfA6/rK\nPgn8q+b9buAT027nrL6AdwKXAQ8N2n70LlS8v9lXNzb77ium/Rlm6bXE9rwe+Ogiy7o9l9+WFwGb\nm/fnAX8MXDzO/XOaPX8vAhuP/gPmP7jorvn3favbnLWjqu4CvtNXvNT22wH8XlWdqqpj9P5zbVmN\ndq4VS2xPWPxUcLfnMqrqyaq6v3n/LHCY3nVSY9s/pxn+w1xApuUV8KUkh5Jc3ZRdWFVPNe+fAi6c\nTtPWrKW235vo7aML3F+H9+Hmvl/7WsMUbs8hNWdMXgbczRj3z2mGv0eaR3d5VV0GvAf4lSTvbM+s\n3u9Bt/MKDbH93LaD3Ujvmp/NwBPAf1pmWbdnnyTnAf8T+EhV/UV73qj75zTD/wSwoTW9gdO/uTRA\nVT3R/HsSuJXez7ynmnsrkeSNwLem18I1aant17+/vrkp0zKq6lvVAH6Hl4Yi3J4DJHklveD/bFV9\noSke2/45zfD/wQVkSc6mdxHY/im2Z01Jck6SVzXvzwWuAB6itw0/2Cz2QeALi69BS1hq++0Hfi7J\n2UneCmwC7plC+9aUJqAW/BN6+yi4PZeVJMA+4OGq+lRr1tj2z6k9zKW8CGxUFwK39vYRzgI+V1V3\nJDkE/H6Sq4BjwM9Or4mzLcnv0bsVyRuSPAZcB3yCRbZfVT2c5PeBh4EXgH/e9GbVWGR7Xg9sTbKZ\n3hDEnwILF4i6PZd3OfB+4MEk9zVlH2OM+6cXeUlSB031Ii9J0nQY/pLUQYa/JHWQ4S9JHWT4S1IH\nGf6S1EGGvyR1kOEvSR30/wG4fx9RYGhtEQAAAABJRU5ErkJggg==\n", "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "### Viz data normed\n", "width = 2\n", "fig, ax = plt.subplots()\n", "ax.bar(men_bins[0:len(men_binned)], men_binned/float(men_binned.sum()), width, color='y', label=\"Men\")\n", "ax.bar(women_bins[0:len(men_binned)]+width, women_binned/float(women_binned.sum()), width, color='r', label=\"Women\")\n", "plt.legend()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " [-----------------100%-----------------] 165000 of 165000 complete in 29.3 sec\n", "Probability B > A: 100.0 %\n", "Interval of B:s lift over A:\n", "11.384920224 % - 17.9733530367 %\n" ] } ], "source": [ "### Start with uniform probability over the bins as prior\n", "p_A = pm.Dirichlet(\"p_A\", theta=np.ones(len(women_binned)))\n", "p_B = pm.Dirichlet(\"p_B\", theta=np.ones(len(men_binned)))\n", "\n", "### Use a multinomial dist. with the probabilitys of bins\n", "obs_A = pm.Multinomial(\"obs_A\", p=p_A, n=sum(women_binned), value=women_binned, observed=True)\n", "obs_B = pm.Multinomial(\"obs_B\", p=p_B, n=sum(men_binned), value=men_binned, observed=True)\n", "\n", "@pm.deterministic\n", "def percent_better(p_B=p_B, p_A=p_A):\n", " \"\"\" Calc how much better B is over A \"\"\"\n", " exp_len_men = np.dot(p_B.astype(float), men_bins[0:len(men_binned)-1])\n", " exp_len_women = np.dot(p_A.astype(float), women_bins[0:len(women_binned)-1])\n", "\n", " return ((exp_len_men / exp_len_women) - 1)*100.0 # Percent\n", "\n", "### Run MCMC on model\n", "model = pm.Model([p_A, p_B, obs_A, obs_B, percent_better])\n", "map_ = pm.MAP(model)\n", "map_.fit()\n", "mcmc = pm.MCMC(model)\n", "mcmc.sample(165000, burn=130000, thin=2)\n", "\n", "percent_better_samples = mcmc.trace(\"percent_better\")[:]\n", "\n", "print \"\\nProbability B > A: {} %\".format((percent_better_samples > 0).mean()*100.0)\n", "print \"Interval of B:s lift over A:\"\n", "print \"{} % - {} %\".format(np.percentile(percent_better_samples, 2.5), \n", " np.percentile(percent_better_samples, 97.5))\n", "#pm.Matplot.plot(mcmc)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "collapsed": false }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MannwhitneyuResult(statistic=65864616.5, pvalue=1.9425419207873861e-43)\n", "RanksumsResult(statistic=13.769438756793058, pvalue=3.8922512718382413e-43)\n", "Ttest_indResult(statistic=11.629082223056139, pvalue=3.9995265342515866e-31)\n" ] } ], "source": [ "# Some freq stats also...\n", "print scipy.stats.mannwhitneyu(men, women)\n", "print scipy.stats.ranksums(men, women)\n", "print scipy.stats.ttest_ind(men, women, equal_var=False) # Assumes normality" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 2", "language": "python", "name": "python2" } }, "nbformat": 4, "nbformat_minor": 0 }
to join this conversation on GitHub. Already have an account? Sign in to comment