Skip to content

Instantly share code, notes, and snippets.

@sonalisharma
Created May 16, 2013 18:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sonalisharma/5593987 to your computer and use it in GitHub Desktop.
Save sonalisharma/5593987 to your computer and use it in GitHub Desktop.
{
"metadata": {
"name": "Final Analysis"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import sqlite3 as lite\n",
"import os\n",
"import sys\n",
"from itertools import islice\n",
"from pandas import DataFrame\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.mlab as mlab"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 3
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"USA LOCATION ANALYSIS"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Connecting to database to get all user related data\n",
"con = lite.connect(\"../../Database/Books.db\")\n",
"con.text_factory = str\n",
"c = con.cursor()\n",
"query = \"SELECT u.UserID,u.location, u.age from BXUsers u where location like '%usa%'\"\n",
"print query\n",
"c.execute(query)\n",
"rows = c.fetchall()\n",
"df_users = DataFrame(rows,columns=['UserId','location','age'])\n",
"df_users.set_index('UserId')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"SELECT u.UserID,u.location, u.age from BXUsers u where location like '%usa%'\n"
]
},
{
"output_type": "pyout",
"prompt_number": 39,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 139971 entries, 1 to 278857\n",
"Data columns:\n",
"location 139971 non-null values\n",
"age 76669 non-null values\n",
"dtypes: float64(1), object(1)"
]
}
],
"prompt_number": 39
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"MAX and MIN age"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Find minimum and maximum ages \n",
"print \"(min,max)\"\n",
"df_users['age'].min(), df_users['age'].max()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"(min,max)\n"
]
},
{
"output_type": "pyout",
"prompt_number": 40,
"text": [
"(0.0, 237.0)"
]
}
],
"prompt_number": 40
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Age Distribution"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Plot the distribution of age\n",
"plt.figure(0)\n",
"plt.hist(df_users['age'], bins=np.arange(df_users['age'].min(), df_users['age'].max(), 5))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 41,
"text": [
"(array([ 353, 96, 1382, 6537, 7628, 9151, 9778, 8793, 7600, 7152, 6846,\n",
" 5138, 3075, 1704, 849, 321, 119, 20, 18, 6, 59, 3,\n",
" 3, 7, 2, 1, 0, 1, 0, 5, 2, 3, 1,\n",
" 1, 0, 1, 0, 0, 0, 1, 3, 2, 1, 1,\n",
" 0, 5, 0]),\n",
" array([ 0., 5., 10., 15., 20., 25., 30., 35., 40.,\n",
" 45., 50., 55., 60., 65., 70., 75., 80., 85.,\n",
" 90., 95., 100., 105., 110., 115., 120., 125., 130.,\n",
" 135., 140., 145., 150., 155., 160., 165., 170., 175.,\n",
" 180., 185., 190., 195., 200., 205., 210., 215., 220.,\n",
" 225., 230., 235.]),\n",
" <a list of 47 Patch objects>)"
]
},
{
"output_type": "display_data",
"png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAD9CAYAAACm2+DgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAFvNJREFUeJzt3X9o1Pfhx/HXBWv2w8T5g2ghvTprdpdTq0n7yYWWNFeR\nEgZtMkuJhbixZKxNOzI7GcIsNIHRYO2wiczTjV7HJhLGINB12tAIF5uN3F2YrGAuarZ2saWNZope\nNOns+vn+ka+f1uYd29xdcvnxfMBBfN+935/3552Pn1fen/v8cNm2bQsAgC/IynQHAACzEwEBADAi\nIAAARgQEAMCIgAAAGBEQAACj2wZEbW2tVq1apY0bNzpliURClZWVcrvdqqqq0sjIiPNea2urCgoK\n5PP51N3d7ZTH43EVFxdr7dq12rNnj1N+48YN1dXV6e6771YgENBHH32UznUDAKTgtgHxwx/+UG++\n+eYtZcFgUG63W+fOnVN+fr4OHTokSbpw4YIOHjyoEydOKBgMqqGhwamza9cu7d69W7FYTF1dXert\n7ZUktbe368qVK4rH46qoqNAvf/nLdK8fACBJtw2IsrIyLVu27JayaDSquro6ZWdnq7a2VpFIRJIU\niURUUVEht9ut8vJy2bbtzC7OnDmj6upqrVixQtu2bbulTk1Njb7xjW/oxz/+sVMOAMi8KX8HEYvF\n5PV6JUler1fRaFTS+M6+sLDQ+ZzH41EkEtHAwIDy8vKccp/Pp56eHknjYePz+SRJy5cv19DQkD7+\n+OPk1wYAkDaLplphKnfmcLlcxvo3y23bvqW9ydo2tQMA+HKp3E1pyjMIy7IUj8cljX/5bFmWJMnv\n96uvr8/5XH9/vyzL0rp16zQ0NOSU9/X1ye/3T6hz6dIlrVq1StnZ2cbl3gyThf564YUXMt6H2fJi\nLBgLxuL2r1RNOSD8fr9CoZBGR0cVCoVUWloqSSopKVFHR4cGBwcVDoeVlZWlnJwcSeOHotra2jQ8\nPKz29vZbAuLIkSO6du2afvOb3zhtAQAy77YB8eSTT+qBBx7Q2bNnddddd+m1115TfX29BgcH5fF4\n9MEHH+jpp5+WJK1atUr19fXasmWLnnnmGbW0tDjtvPzyy3rppZdkWZbKysp0//33S5K+973vaenS\npSosLNSbb76p559/fhpXFQAwFS47HfOQaeZyudIyXZoPwuGwAoFAprsxKzAWn2EsPsNYfCbVfScB\nkQa5ucuVSFw2vpeTs0xXr16a4R4BAAExK4yfZTVZ/2Z33wHMX6nuO7kXEwDAiIAAABgREAAAIwIC\nAGBEQAAAjAgIAIARAQEAMCIgAABGBAQAwIiAAAAYERAAACMCAgBgREBMQW7ucrlcrgkvAJiPuJvr\nFPthvmsrd3MFMPtwN1cAwLQgIAAARgQEAMCIgAAAGBEQAAAjAgIAYERATLtFxmsnXC6XcnOXZ7pz\nADAproOYYj+SuQ6CayQAZALXQQAApgUBAQAwIiAAAEYEBADAiIAAABgREAAAIwICAGBEQAAAjAgI\nAIARAQEAMCIgAABGSQfEb3/7Wz3wwAO67777tHPnTklSIpFQZWWl3G63qqqqNDIy4ny+tbVVBQUF\n8vl86u7udsrj8biKi4u1du1a7dmzJ4VVAQCkU1IBcenSJb344ot66623FIvFdPbsWXV0dCgYDMrt\nduvcuXPKz8/XoUOHJEkXLlzQwYMHdeLECQWDQTU0NDht7dq1S7t371YsFlNXV5d6e3vTs2YAgJQk\nFRBf//rXZdu2rly5otHRUV2/fl3f+ta3FI1GVVdXp+zsbNXW1ioSiUiSIpGIKioq5Ha7VV5eLtu2\nndnFmTNnVF1drRUrVmjbtm1OHQBAZiUdEMFgUGvWrNHq1av14IMPyu/3KxaLyev1SpK8Xq+i0aik\n8YAoLCx06ns8HkUiEQ0MDCgvL88p9/l86unpSWV9AABpsiiZShcvXlR9fb36+vq0bNkyPfHEE3rj\njTemdN/x8Wcr3Op29RsbG52fA4GAAoHAVLoMAPNeOBxWOBxOW3tJBUQ0GlVpaanWrVsnSXriiSf0\n9ttvy7IsxeNxFRUVKR6Py7IsSZLf71dnZ6dTv7+/X5ZlKScnR0NDQ055X1+fSktLjcv8fEAAACb6\n4h/PTU1NKbWX1CGmsrIy9fb26tKlS/r44491/PhxPfLII/L7/QqFQhodHVUoFHJ29iUlJero6NDg\n4KDC4bCysrKUk5MjafxQVFtbm4aHh9Xe3i6/35/SCgEA0iPpR47+7ne/02uvvabr16+roqJCTU1N\nunbtmmpqanTq1CkVFxfryJEjWrJkiSSppaVFBw4c0OLFi3X48GGVlZVJGp811NTU6PLly9q+fbua\nm5sndpJHjgLAlKW67+SZ1FPsBwEBYK7gmdQAgGlBQAAAjAgIAIARAQEAMCIgviA3d7lcLpfxlX6L\njMvJzV0+DcsCgKnhLCbDsqZ+RlLyZzFN1t4c+LUAmOU4iwkAMC0ICACAEQEBADAiIAAARgQEAMCI\ngAAAGBEQAAAjAgIAYERAzErmK6y5yhrATErqkaOYbp9osquvE4npuOUHAEzEDAIAYERAAACMCAgA\ngBEBAQAwIiAAAEYExJzDKbAAZganuc45nAILYGYwgwAAGBEQAAAjAgIAYERAAACMCAgAgNGCDIjc\n3OWTnioKABjnsm3bfM7kLOJyuZTObo4HwWTtJfPe7GlvDvw6AcyQVPedC3IGAQD4cgQEAMCIgAAA\nGBEQAAAjAgIAYERAAACMkg6Ia9eu6Qc/+IG+853vyOfzKRKJKJFIqLKyUm63W1VVVRoZGXE+39ra\nqoKCAvl8PnV3dzvl8XhcxcXFWrt2rfbs2ZPa2gAA0ibpgHjhhRfkdrv1zjvv6J133pHX61UwGJTb\n7da5c+eUn5+vQ4cOSZIuXLiggwcP6sSJEwoGg2poaHDa2bVrl3bv3q1YLKauri719vamvlYAgJQl\nHRCdnZ36xS9+oa997WtatGiRli5dqmg0qrq6OmVnZ6u2tlaRSESSFIlEVFFRIbfbrfLyctm27cwu\nzpw5o+rqaq1YsULbtm1z6gAAMiupgHj//fc1Njam+vp6+f1+7d27V6Ojo4rFYvJ6vZIkr9eraDQq\naTwgCgsLnfoej0eRSEQDAwPKy8tzyn0+n3p6elJZHwBAmiT1RLmxsTGdPXtW+/bt09atW/XUU0/p\nj3/845Qu6Tbd9+h29RsbG52fA4GAAoHAVLoMAPNeOBxWOBxOW3tJBcS6devk8Xj06KOPSpKefPJJ\n/f73v5dlWYrH4yoqKlI8HpdlWZIkv9+vzs5Op35/f78sy1JOTo6Ghoac8r6+PpWWlhqX+fmAAABM\n9MU/npuamlJqL+nvIAoKChSJRPTpp5/qL3/5i7Zu3Sq/369QKKTR0VGFQiFnZ19SUqKOjg4NDg4q\nHA4rKytLOTk5ksYPRbW1tWl4eFjt7e3y+/0prRAAID2Svpvr2bNn9f3vf19jY2PaunWrmpqa9Omn\nn6qmpkanTp1ScXGxjhw5oiVLlkiSWlpadODAAS1evFiHDx9WWVmZpPFZQ01NjS5fvqzt27erubl5\nYie5m+tXfo+7uQK4KdV9J7f7nvhuEu/NnvbmwK8TwAzhdt8AgGlBQAAAjAgIAIARAQEAMCIgAABG\nBAQAwIiAAAAYERAAACMCAgBgREAAAIwICACAEQEBADAiIAAARgQEAMCIgJhXFsnlchlfubnLM905\nAHNMUo8cxWz1iSZ7VkQiMfEZ4ABwO8wgAABGBAQAwIiAAAAYERAAACMCAgBgREAAAIwICACAEQEB\nADAiIAAARgQEAMCIgAAAGBEQAAAjAgIAYERAAACMCAgAgBEBAQAwIiAAAEYEBADAiIAAABgREAAA\no6QD4n//+5+Kior06KOPSpISiYQqKyvldrtVVVWlkZER57Otra0qKCiQz+dTd3e3Ux6Px1VcXKy1\na9dqz549KawGACDdkg6IlpYW+Xw+uVwuSVIwGJTb7da5c+eUn5+vQ4cOSZIuXLiggwcP6sSJEwoG\ng2poaHDa2LVrl3bv3q1YLKauri719vamuDoAgHRJKiDef/99HTt2TD/60Y9k27YkKRqNqq6uTtnZ\n2aqtrVUkEpEkRSIRVVRUyO12q7y8XLZtO7OLM2fOqLq6WitWrNC2bducOgCAzEsqIJ577jnt27dP\nWVmfVY/FYvJ6vZIkr9eraDQqaTwgCgsLnc95PB5FIhENDAwoLy/PKff5fOrp6UlqJQAA6bdoqhXe\neOMN5eXlqaioSOFw2Cm/OZP4Km4elvq8L6vf2Njo/BwIBBQIBL7y8gBgIQiHw7fsl1M15YD429/+\nptdff13Hjh3T2NiYrl69qh07dsiyLMXjcRUVFSkej8uyLEmS3+9XZ2enU7+/v1+WZSknJ0dDQ0NO\neV9fn0pLSydd7ucDAgAw0Rf/eG5qakqpvSkfYnrxxRd1/vx5vfvuu2pra9OWLVv0hz/8QX6/X6FQ\nSKOjowqFQs7OvqSkRB0dHRocHFQ4HFZWVpZycnIkjR+Kamtr0/DwsNrb2+X3+1NaGQBA+qR8HcTN\nw0X19fUaHByUx+PRBx98oKefflqStGrVKtXX12vLli165pln1NLS4tR9+eWX9dJLL8myLJWVlen+\n++9PtTsAgDRx2VP58iBDXC7XlL7j+CrtSZO1l8x7s7298ffmwK8aQBqluu/kSmoAgBEBAQAwIiAA\nAEYEBADAiIAAABgREAAAIwICAGBEQAAAjAgIAIARAQEAMCIgAABGBAQAwIiAAAAYERAAACMCAgBg\nREAsGIvkcrkmvHJzl2e6YwBmqSk/kxpz1ScyPUwokXDNfFcAzAnMIAAARgQEAMCIgAAAGBEQAAAj\nAgIAYERAAACMCAgAgBEBAQAwIiAAAEYEBADAiIAAABgREAAAIwICAGBEQAAAjAgIAIARAQEAMCIg\nAABGBAQAwCipgDh//rwefvhhrV+/XoFAQEePHpUkJRIJVVZWyu12q6qqSiMjI06d1tZWFRQUyOfz\nqbu72ymPx+MqLi7W2rVrtWfPnhRXBwCQLkkFxB133KH9+/fr9OnT+tOf/qTnn39eiURCwWBQbrdb\n586dU35+vg4dOiRJunDhgg4ePKgTJ04oGAyqoaHBaWvXrl3avXu3YrGYurq61Nvbm541AwCkJKmA\nWL16tTZv3ixJWrlypdavX69YLKZoNKq6ujplZ2ertrZWkUhEkhSJRFRRUSG3263y8nLZtu3MLs6c\nOaPq6mqtWLFC27Ztc+oAADIr5e8gBgYGdPr0aZWUlCgWi8nr9UqSvF6votGopPGAKCwsdOp4PB5F\nIhENDAwoLy/PKff5fOrp6Um1SwCANFiUSuVEIqHq6mrt379fS5YskW3bX7muy+WaUHa7+o2Njc7P\ngUBAgUBgKl0FgHkvHA4rHA6nrb2kA+LGjRt6/PHHtWPHDlVWVkqSLMtSPB5XUVGR4vG4LMuSJPn9\nfnV2djp1+/v7ZVmWcnJyNDQ05JT39fWptLTUuLzPBwQAYKIv/vHc1NSUUntJHWKybVt1dXXasGGD\ndu7c6ZT7/X6FQiGNjo4qFAo5O/uSkhJ1dHRocHBQ4XBYWVlZysnJkTR+KKqtrU3Dw8Nqb2+X3+9P\naYUAAOnhsqdyXOj/dXd366GHHtK9997rHCpqbm7Wgw8+qJqaGp06dUrFxcU6cuSIlixZIklqaWnR\ngQMHtHjxYh0+fFhlZWWSxmcNNTU1unz5srZv367m5uaJnXS5pnT46suM93my9pJ5b7a3d/tlpXNs\nAcweqe47kwqImUZATO+y5sAmACAJqe47uZIaAGBEQAAAjAiIBW+RXC6X8ZWbuzzTnQOQQSldB4H5\n4BNN9r1FIjHxWhUACwczCACAEQEBADAiIAAARgQEAMCIgAAAGM3rgMjNXW48fRMA8OXm9WmuicRl\nTX4rCwDA7czrGQQAIHkEBADAiIAAABgREAAAIwICAGBEQAAAjAgIAIARAQEAMCIgAABGBAQAwIiA\nwG3wOFJgIZvX92JCqngcKbCQMYMAABgREAAAIwICAGBEQAAAjAgIAIARAQEAMCIgAABGBAQAwIiA\nAAAYzZmA4JYPADCz5kxAjN/yYeIrkbic0V4tXOb7NBHYwPzBvZiQJPN9mrhHEzB/zKEZBABgJs2K\ngDh58qQKCwtVUFCgAwcOZLo7mCPC4XCmuzBrMBafYSzSZ1YExE9/+lMdPnxYnZ2d+vWvf63h4eFM\ndwlJm7lnSLAj+Axj8RnGIn0yHhBXrlyRJD300EO6++679cgjjygSiUyhhcl3SMiEm99NTP8JBc3N\nezm7DZhGGQ+IWCwmr9fr/Nvn86mnp2cKLUy+Q8JsM3mYu1yLp7yj/+9/x8TZbcD0mUNnMd1uRpDM\newutvZlcVjKztxvG0kTi8pfMBid/b6HNIpuamjLdhVmDsUiPjAeEZVn6+c9/7vz79OnTqqiouOUz\nts1sAABmWsYPMS1dulTS+JlM7733nt566y35/f4M9woAkPEZhCS98soreuqpp3Tjxg01NDRo5cqV\nme4SACx4GZ9BSFJ5ebni8bgGBgbU0NDglC/06yPWrFmje++9V0VFRSopKZEkJRIJVVZWyu12q6qq\nSiMjIxnu5fSora3VqlWrtHHjRqfsduve2tqqgoIC+Xw+dXd3Z6LL08Y0Fo2NjcrPz1dRUZGKiop0\n/Phx5735PBbnz5/Xww8/rPXr1ysQCOjo0aOSFua2MdlYpHXbsGexzZs3211dXfZ7771nezwe++LF\ni5nu0oxas2aN/Z///OeWsr1799o/+clP7LGxMfvZZ5+19+3bl6HeTa+TJ0/af//73+0NGzY4ZZOt\n+9DQkO3xeOx///vfdjgctouKijLV7WlhGovGxkb7V7/61YTPzvex+PDDD+1Tp07Ztm3bFy9etL/9\n7W/bV69eXZDbxmRjkc5tY1bMIExSvz5ifrC/8AV9NBpVXV2dsrOzVVtbO2/HpKysTMuWLbulbLJ1\nj0QiqqiokNvtVnl5uWzbViKRyES3p4VpLCTzyRvzfSxWr16tzZs3S5JWrlyp9evXKxaLLchtY7Kx\nkNK3bczagEj9+oi5z+VyacuWLaqqqtLrr78u6dZx8Xq9ikajmezijJps3SORiAoLC53PeTyeBTEu\nBw4cUGlpqfbu3ev8R49GowtmLAYGBnT69GmVlJQs+G3j5ljcPMEnXdvGrA0ISH/961/1j3/8Q83N\nzfrZz36mjz76aEGf8juVdZ/v10DU19fr3XffVUdHh/75z3/q8OHDksxjNB/HIpFIqLq6Wvv379eS\nJUsW9Lbx+bH45je/mdZtY9YGhGVZ6u/vd/59+vRplZaWZrBHM+/OO++UJBUWFuqxxx7Tn//8Z1mW\npXg8LkmKx+OyLCuTXZxRk6273+9XX1+f87n+/v55Py55eXlyuVxaunSpnn32WbW3t0taGGNx48YN\nPf7449qxY4cqKyslLdxtwzQW6dw2Zm1ALPTrI65fv+5MDS9evKiOjg5VVFTI7/crFAppdHRUoVBo\nQYXmZOteUlKijo4ODQ4OKhwOKysrSzk5ORnu7fT68MMPJUmffPKJjh49qu9+97uS5v9Y2Laturo6\nbdiwQTt37nTKF+K2MdlYpHXbSMOX6dMmHA7bXq/Xvueee+yWlpZMd2dG/etf/7I3bdpkb9q0yd6y\nZYv96quv2rZt21evXrUfe+wx+6677rIrKyvtRCKR4Z5Oj+3bt9t33nmnvXjxYjs/P98OhUK3XfdX\nXnnFvueee+zCwkL75MmTGex5+t0cizvuuMPOz8+3X331VXvHjh32xo0b7fvuu89+7rnnbjnbbT6P\nxdtvv227XC5706ZN9ubNm+3Nmzfbx48fX5Dbhmksjh07ltZtw2XbC/igNgBgUrP2EBMAILMICACA\nEQEBADAiIAAARgQEAMCIgAAAGP0fszo/xKJL6NUAAAAASUVORK5CYII=\n"
}
],
"prompt_number": 41
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Mean age"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Find mean age\n",
"mean_usa = df_users['age'].mean()\n",
"mean_usa"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 46,
"text": [
"37.89514666944919"
]
}
],
"prompt_number": 46
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Standard deviation"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Find standard deviation\n",
"std_usa = df_users['age'].std(axis=1)\n",
"std_usa"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 47,
"text": [
"15.119432049277446"
]
}
],
"prompt_number": 47
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#function to calculate z score\n",
"def zscore(age):\n",
" return (float(age)-mean_usa)/std_usa"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 48
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Z-score calculation"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#create a new column in the dataframe to store z-score\n",
"df_users['z_index']=df_users['age'].apply(zscore)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 49
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Data frame with z-scores (displaying top 50)"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#display dataframe\n",
"df_users[:50]"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>UserId</th>\n",
" <th>location</th>\n",
" <th>age</th>\n",
" <th>z_index</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0 </th>\n",
" <td> 1</td>\n",
" <td> nyc, new york, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1 </th>\n",
" <td> 2</td>\n",
" <td> stockton, california, usa</td>\n",
" <td> 18</td>\n",
" <td>-1.315866</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2 </th>\n",
" <td> 6</td>\n",
" <td> santa monica, california, usa</td>\n",
" <td> 61</td>\n",
" <td> 1.528156</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3 </th>\n",
" <td> 7</td>\n",
" <td> washington, dc, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4 </th>\n",
" <td> 9</td>\n",
" <td> germantown, tennessee, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5 </th>\n",
" <td> 12</td>\n",
" <td> fort bragg, california, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6 </th>\n",
" <td> 14</td>\n",
" <td> mediapolis, iowa, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7 </th>\n",
" <td> 16</td>\n",
" <td> albuquerque, new mexico, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8 </th>\n",
" <td> 17</td>\n",
" <td> chesapeake, virginia, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9 </th>\n",
" <td> 20</td>\n",
" <td> langhorne, pennsylvania, usa</td>\n",
" <td> 19</td>\n",
" <td>-1.249726</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td> 23</td>\n",
" <td> philadelphia, pennsylvania, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td> 25</td>\n",
" <td> oakland, california, usa</td>\n",
" <td> 55</td>\n",
" <td> 1.131316</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td> 26</td>\n",
" <td> bellevue, washington, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td> 27</td>\n",
" <td> chicago, illinois, usa</td>\n",
" <td> 32</td>\n",
" <td>-0.389905</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td> 30</td>\n",
" <td> anchorage, alaska, usa</td>\n",
" <td> 24</td>\n",
" <td>-0.919026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td> 32</td>\n",
" <td> portland, oregon, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td> 33</td>\n",
" <td> costa mesa, california, usa</td>\n",
" <td> 34</td>\n",
" <td>-0.257625</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td> 35</td>\n",
" <td> grafton, wisconsin, usa</td>\n",
" <td> 17</td>\n",
" <td>-1.382006</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td> 39</td>\n",
" <td> cary, north carolina, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td> 40</td>\n",
" <td> tonawanda, new york, usa</td>\n",
" <td> 32</td>\n",
" <td>-0.389905</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td> 41</td>\n",
" <td> santee, california, usa</td>\n",
" <td> 14</td>\n",
" <td>-1.580426</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td> 42</td>\n",
" <td> appleton, wisconsin, usa</td>\n",
" <td> 17</td>\n",
" <td>-1.382006</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td> 44</td>\n",
" <td> black mountain, north carolina, usa</td>\n",
" <td> 51</td>\n",
" <td> 0.866756</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td> 48</td>\n",
" <td> chicago, illinois, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td> 51</td>\n",
" <td> renton, washington, usa</td>\n",
" <td> 34</td>\n",
" <td>-0.257625</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td> 53</td>\n",
" <td> tacoma, washington, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td> 54</td>\n",
" <td> eubank, kentucky, usa</td>\n",
" <td> 44</td>\n",
" <td> 0.403775</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td> 56</td>\n",
" <td> cheyenne, wyoming, usa</td>\n",
" <td> 24</td>\n",
" <td>-0.919026</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td> 59</td>\n",
" <td> asheville, north carolina, usa</td>\n",
" <td> 23</td>\n",
" <td>-0.985166</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td> 61</td>\n",
" <td> winfield, alabama, usa</td>\n",
" <td> 30</td>\n",
" <td>-0.522185</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td> 62</td>\n",
" <td> kennewick, washington, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td> 63</td>\n",
" <td> nyack, new york, usa</td>\n",
" <td> 57</td>\n",
" <td> 1.263596</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td> 67</td>\n",
" <td> framingham, massachusetts, usa</td>\n",
" <td> 43</td>\n",
" <td> 0.337635</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td> 70</td>\n",
" <td> rochester, new york, usa</td>\n",
" <td> 44</td>\n",
" <td> 0.403775</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td> 73</td>\n",
" <td> wentzville, missouri, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td> 75</td>\n",
" <td> long beach, california, usa</td>\n",
" <td> 37</td>\n",
" <td>-0.059205</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td> 76</td>\n",
" <td> charleston, south carolina, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td> 78</td>\n",
" <td> oakland, california, usa</td>\n",
" <td> 18</td>\n",
" <td>-1.315866</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td> 81</td>\n",
" <td> santa cruz, california, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td> 82</td>\n",
" <td> del mar, california, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td> 83</td>\n",
" <td> eugene, oregon, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td> 84</td>\n",
" <td> san diago, california, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td> 86</td>\n",
" <td> los angeles, california, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td> 87</td>\n",
" <td> richardson, texas, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td> 88</td>\n",
" <td> fayetteville, georgia, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td> 89</td>\n",
" <td> plano, texas, usa</td>\n",
" <td> 54</td>\n",
" <td> 1.065176</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td> 90</td>\n",
" <td> powhatan, virginia, usa</td>\n",
" <td> 42</td>\n",
" <td> 0.271495</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td> 93</td>\n",
" <td> eden, wisconsin, usa</td>\n",
" <td> 50</td>\n",
" <td> 0.800616</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td> 95</td>\n",
" <td> grand island, new york, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td> 97</td>\n",
" <td> mechanicsburg, pennsylvania, usa</td>\n",
" <td>NaN</td>\n",
" <td> NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"output_type": "pyout",
"prompt_number": 53,
"text": [
" UserId location age z_index\n",
"0 1 nyc, new york, usa NaN NaN\n",
"1 2 stockton, california, usa 18 -1.315866\n",
"2 6 santa monica, california, usa 61 1.528156\n",
"3 7 washington, dc, usa NaN NaN\n",
"4 9 germantown, tennessee, usa NaN NaN\n",
"5 12 fort bragg, california, usa NaN NaN\n",
"6 14 mediapolis, iowa, usa NaN NaN\n",
"7 16 albuquerque, new mexico, usa NaN NaN\n",
"8 17 chesapeake, virginia, usa NaN NaN\n",
"9 20 langhorne, pennsylvania, usa 19 -1.249726\n",
"10 23 philadelphia, pennsylvania, usa NaN NaN\n",
"11 25 oakland, california, usa 55 1.131316\n",
"12 26 bellevue, washington, usa NaN NaN\n",
"13 27 chicago, illinois, usa 32 -0.389905\n",
"14 30 anchorage, alaska, usa 24 -0.919026\n",
"15 32 portland, oregon, usa NaN NaN\n",
"16 33 costa mesa, california, usa 34 -0.257625\n",
"17 35 grafton, wisconsin, usa 17 -1.382006\n",
"18 39 cary, north carolina, usa NaN NaN\n",
"19 40 tonawanda, new york, usa 32 -0.389905\n",
"20 41 santee, california, usa 14 -1.580426\n",
"21 42 appleton, wisconsin, usa 17 -1.382006\n",
"22 44 black mountain, north carolina, usa 51 0.866756\n",
"23 48 chicago, illinois, usa NaN NaN\n",
"24 51 renton, washington, usa 34 -0.257625\n",
"25 53 tacoma, washington, usa NaN NaN\n",
"26 54 eubank, kentucky, usa 44 0.403775\n",
"27 56 cheyenne, wyoming, usa 24 -0.919026\n",
"28 59 asheville, north carolina, usa 23 -0.985166\n",
"29 61 winfield, alabama, usa 30 -0.522185\n",
"30 62 kennewick, washington, usa NaN NaN\n",
"31 63 nyack, new york, usa 57 1.263596\n",
"32 67 framingham, massachusetts, usa 43 0.337635\n",
"33 70 rochester, new york, usa 44 0.403775\n",
"34 73 wentzville, missouri, usa NaN NaN\n",
"35 75 long beach, california, usa 37 -0.059205\n",
"36 76 charleston, south carolina, usa NaN NaN\n",
"37 78 oakland, california, usa 18 -1.315866\n",
"38 81 santa cruz, california, usa NaN NaN\n",
"39 82 del mar, california, usa NaN NaN\n",
"40 83 eugene, oregon, usa NaN NaN\n",
"41 84 san diago, california, usa NaN NaN\n",
"42 86 los angeles, california, usa NaN NaN\n",
"43 87 richardson, texas, usa NaN NaN\n",
"44 88 fayetteville, georgia, usa NaN NaN\n",
"45 89 plano, texas, usa 54 1.065176\n",
"46 90 powhatan, virginia, usa 42 0.271495\n",
"47 93 eden, wisconsin, usa 50 0.800616\n",
"48 95 grand island, new york, usa NaN NaN\n",
"49 97 mechanicsburg, pennsylvania, usa NaN NaN"
]
}
],
"prompt_number": 53
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Minimum age within z-score -3 to +3"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#display min age which is greater than -3.0 standard deviation\n",
"df_users[df_users['z_index']>=-3.0][['age']].min()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 60,
"text": [
"age 0"
]
}
],
"prompt_number": 60
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Maximum age within z-score -3 to +3"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#display max age which is less than 3.0 standard deviation\n",
"df_users[df_users['z_index']<=3.0][['age']].max()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 61,
"text": [
"age 83"
]
}
],
"prompt_number": 61
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Replacing missing age by mean age"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#query to replace missing age by mean age\n",
"query_upd = \"update BXUsers set age=\"+str(mean_usa)+\" where age is null and location like '%usa%'\"\n",
"print query_upd"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"update BXUsers set age=37.8951466694 where age is null and location like '%usa%'\n"
]
}
],
"prompt_number": 63
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#replacing missinhg value by mean age\n",
"c.execute(query_upd)\n",
"con.commit()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 64
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Selecting users after replacing missing age with mean age and fetching usa based records"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Select all records after replacing null ages with mean age\n",
"query = \"SELECT u.UserID,u.location, u.age from BXUsers u where location like '%usa%'\"\n",
"print query\n",
"c.execute(query)\n",
"rows_new = c.fetchall()\n",
"df_users_new = DataFrame(rows_new,columns=['UserId','location','age'])\n",
"df_users_new.set_index('UserId')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"SELECT u.UserID,u.location, u.age from BXUsers u where location like '%usa%'\n"
]
},
{
"output_type": "pyout",
"prompt_number": 65,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 139971 entries, 1 to 278857\n",
"Data columns:\n",
"location 139971 non-null values\n",
"age 139971 non-null values\n",
"dtypes: float64(1), object(1)"
]
}
],
"prompt_number": 65
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Reploting distribution after replacing missing age"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#plot distribution after updating ages\n",
"plt.figure(0)\n",
"plt.hist(df_users_new['age'], bins=np.arange(df_users_new['age'].min(), df_users_new['age'].max(), 5))"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 66,
"text": [
"(array([ 353, 96, 1382, 6537, 7628, 9151, 9778, 72095, 7600,\n",
" 7152, 6846, 5138, 3075, 1704, 849, 321, 119, 20,\n",
" 18, 6, 59, 3, 3, 7, 2, 1, 0,\n",
" 1, 0, 5, 2, 3, 1, 1, 0, 1,\n",
" 0, 0, 0, 1, 3, 2, 1, 1, 0,\n",
" 5, 0]),\n",
" array([ 0., 5., 10., 15., 20., 25., 30., 35., 40.,\n",
" 45., 50., 55., 60., 65., 70., 75., 80., 85.,\n",
" 90., 95., 100., 105., 110., 115., 120., 125., 130.,\n",
" 135., 140., 145., 150., 155., 160., 165., 170., 175.,\n",
" 180., 185., 190., 195., 200., 205., 210., 215., 220.,\n",
" 225., 230., 235.]),\n",
" <a list of 47 Patch objects>)"
]
},
{
"output_type": "display_data",
"png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAD9CAYAAACm2+DgAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3X9M3Pd9x/HnoSTXsRAvkEEswcFw2P3ID/tIj0OrPC5W\nFZ0ixYddVaQa0WRucua1YunQZG+OVCxZrbx0ikM0cDzlpG0oQ1UlpKSLw4q1w6ETd8fmNRJ3UNiS\nkU4Gm9LBQTAm62d/0HwT4i8YG2zAvB7SSfC+7+fD5/Px+fvi+/3e93AYYwwiIiJfkLPRAxARkc1J\nASEiIrYUECIiYksBISIithQQIiJiSwEhIiK2bhgQf/u3f8vv/d7v8eSTT/Liiy8CkM1miUQiuFwu\namtrmZmZsbZvaWmhoqICn89Hb2+vVc9kMlRWVlJeXs7x48et+sLCAtFolNLSUkKhEGNjY+s5PxER\nuUUrBsTk5CTf/e53+fGPf0wqleJnP/sZXV1dtLW14XK5GB4epri4mDNnzgBw+fJlWltbOX/+PG1t\nbTQ2Nlp9NTU1cfToUVKpFD09PfT39wPQ2dnJ1NQUmUyGcDjMyZMnb+N0RURktVYMiN/4jd/AGMPU\n1BRzc3N8/PHH/NZv/RbJZJJoNIrT6aShoYFEIgFAIpEgHA7jcrmoqanBGGMdXQwNDVFXV0dBQQEH\nDx5c0qa+vp7c3FwOHz5s1UVEZGPdMCDa2tooKyvj4Ycf5itf+QrBYJBUKoXH4wHA4/GQTCaBxZ29\n1+u12rvdbhKJBCMjIxQWFlp1n89HX18fAMlkEp/PB0B+fj7j4+PMz8+v7yxFROSm3bPSk1euXOHI\nkSOk02kefPBBvv71r/OjH/2Im/l0DofDcV3NGGPVjTFL+luub7t+RETkxm71E5VWPIJIJpNUV1fz\nyCOPUFBQwNe//nXee+89AoEAmUwGWLz4HAgEAAgGg6TTaav94OAggUCARx55hPHxcaueTqcJBoPX\ntZmcnKSoqAin07nsJPUwfOc739nwMWyWh9ZCa6G1WPmxFisGxN69e+nv72dycpL5+XnOnTvH008/\nTTAYJBaLMTc3RywWo7q6GoCqqiq6uroYHR0lHo+Tk5NDXl4esHgqqqOjg4mJCTo7O5cERHt7O7Oz\ns5w9e9bqS0RENtaKp5geeOABXnrpJQ4cOMDHH39MOBzmqaeeoqqqivr6etxuN5WVlZw6dQqAoqIi\njhw5wr59+7jvvvt4/fXXrb6+//3vU19fz1/8xV/w3HPP8eUvfxmAAwcO8O677+L1eikvL6ejo+M2\nTldERFbLYdZ6DHKHOByONR8u3S3i8TihUGijh7EpaC0+o7X4jNbiM2vZdyogRETuYmvZd+qjNkRE\nxJYCQkREbCkgRETElgJCRERsKSBERMSWAkJERGwpIERExJYCQkREbCkgRETElgJCRERsKSBERMSW\nAkJERGwpIERExJYCQkREbCkgRETElgJCRERsKSDugAceyMfhcFz3eOCB/I0emojIsvQX5e4Ah8MB\n2I19685JRLaG2/YX5YaGhvD7/dZjx44dtLS0MDMzQyQSweVyUVtby8zMjNWmpaWFiooKfD4fvb29\nVj2TyVBZWUl5eTnHjx+36gsLC0SjUUpLSwmFQoyNjd3SREREZH2tGBBut5uLFy9y8eJF/u3f/o3c\n3FwOHDhAa2srLpeL4eFhiouLOXPmDACXL1+mtbWV8+fP09bWRmNjo9VXU1MTR48eJZVK0dPTQ39/\nPwCdnZ1MTU2RyWQIh8OcPHnyNk5XRERWa9XXILq7u3nkkUcoKSkhmUwSjUZxOp00NDSQSCQASCQS\nhMNhXC4XNTU1GGOso4uhoSHq6uooKCjg4MGDS9rU19eTm5vL4cOHrbqIiGysVQdER0cH3/jGNwBI\npVJ4PB4APB4PyWQSWNzZe71eq43b7SaRSDAyMkJhYaFV9/l89PX1AZBMJvH5fADk5+czPj7O/Pz8\nGqclIiJrdc9qNrp27Rpvv/02p06dAripCx6LF2iXMsZYdWPMkv5W6ru5udn6OhQKEQqFVj0OEZHt\nIB6PE4/H16WvVQXEuXPnePLJJ/nt3/5tAAKBAJlMBr/fTyaTIRAIABAMBunu7rbaDQ4OEggEyMvL\nY3x83Kqn02mCwaDVJp1O43a7mZycpKioCKfTaTuOzweEiIhc74u/PJ84ceKW+1rVKaZ//Md/tE4v\nweJOPRaLMTc3RywWo7q6GoCqqiq6uroYHR0lHo+Tk5NDXl4esHgqqqOjg4mJCTo7O5cERHt7O7Oz\ns5w9e9bqS0RENpi5gZmZGVNQUGCmp6et2vT0tNm/f78pKSkxkUjEZLNZ67nTp0+bXbt2Ga/Xay5c\nuGDVBwYGjN/vN2VlZebYsWNW/dq1a+bQoUOmpKTE1NTUmEuXLtmOYxVD3bQAA8bmsXXnJCJbw1r2\nM7pR7g7QjXIislFu241yIiKyfSkgRETElgJCRERsKSBERMSWAkJERGwpIERExJYCQkREbCkgRETE\nlgJCRERsKSBERMSWAkJERGwpIERExJYCQkREbCkgRETElgJCRERsKSBERMSWAkJERGwpIERExNYN\nA2J2dpY//MM/5Hd/93fx+XwkEgmy2SyRSASXy0VtbS0zMzPW9i0tLVRUVODz+ejt7bXqmUyGyspK\nysvLOX78uFVfWFggGo1SWlpKKBRibGxsnacoIiK34oYB8Z3vfAeXy8X777/P+++/j8fjoa2tDZfL\nxfDwMMXFxZw5cwaAy5cv09rayvnz52lra6OxsdHqp6mpiaNHj5JKpejp6aG/vx+Azs5OpqamyGQy\nhMNhTp48eZumKiIiN+OGAdHd3c1f/uVf8qUvfYl77rmHHTt2kEwmiUajOJ1OGhoaSCQSACQSCcLh\nMC6Xi5qaGowx1tHF0NAQdXV1FBQUcPDgwSVt6uvryc3N5fDhw1ZdREQ21ooB8fOf/5yrV69y5MgR\ngsEgp06dYm5ujlQqhcfjAcDj8ZBMJoHFnb3X67Xau91uEokEIyMjFBYWWnWfz0dfXx8AyWQSn88H\nQH5+PuPj48zPz6/vLEVE5Kbds9KTV69e5Wc/+xkvv/wyX/3qV3nhhRf4wQ9+gDFm1T/A4XBcVzPG\nWHVjzJL+Vuq7ubnZ+joUChEKhVY9DhGR7SAejxOPx9elrxUD4pFHHsHtdvPss88C8I1vfIO///u/\nJxAIkMlk8Pv9ZDIZAoEAAMFgkO7ubqv94OAggUCAvLw8xsfHrXo6nSYYDFpt0uk0brebyclJioqK\ncDqdtuP5fECIiMj1vvjL84kTJ265rxteg6ioqCCRSPCrX/2Kf/qnf+KrX/0qwWCQWCzG3NwcsViM\n6upqAKqqqujq6mJ0dJR4PE5OTg55eXnA4qmojo4OJiYm6OzsXBIQ7e3tzM7OcvbsWasvERHZYOYG\nhoaGTDAYNLt37zZNTU1mZmbGTE9Pm/3795uSkhITiURMNpu1tj99+rTZtWuX8Xq95sKFC1Z9YGDA\n+P1+U1ZWZo4dO2bVr127Zg4dOmRKSkpMTU2NuXTpku04VjHUTQswYGweW3dOIrI1rGU/4/h1B5ue\nw+G4qWsfm8ni9Ra7sW/dOYnI1rCWfafupBYREVsKCBERsaWAEBERWwoIERGxpYAQERFbCggREbGl\ngBAREVsKCBERsaWAEBERWwoIERGxpYAQERFbCggREbGlgBAREVsKCBERsaWAEBERWwoIERGxpYAQ\nERFbCggREbGlgBAREVs3DIiysjKeeOIJ/H4/VVVVAGSzWSKRCC6Xi9raWmZmZqztW1paqKiowOfz\n0dvba9UzmQyVlZWUl5dz/Phxq76wsEA0GqW0tJRQKMTY2Nh6zk9ERG7RDQPC4XAQj8e5ePEiyWQS\ngLa2NlwuF8PDwxQXF3PmzBkALl++TGtrK+fPn6etrY3Gxkarn6amJo4ePUoqlaKnp4f+/n4AOjs7\nmZqaIpPJEA6HOXny5O2Yp4iI3KRVnWIyxiz5PplMEo1GcTqdNDQ0kEgkAEgkEoTDYVwuFzU1NRhj\nrKOLoaEh6urqKCgo4ODBg0va1NfXk5uby+HDh626iIhsrFUdQezbt4/a2lreeustAFKpFB6PBwCP\nx2MdWSQSCbxer9XW7XaTSCQYGRmhsLDQqvt8Pvr6+oDFsPH5fADk5+czPj7O/Pz8Ok1PRERu1T03\n2uAnP/kJO3fuJJPJ8Oyzz1JVVXXdEcVKHA7HdTVjjFU3xizpb6W+m5ubra9DoRChUGjV4xAR2Q7i\n8TjxeHxd+rphQOzcuRMAr9fL/v37efvttwkEAmQyGfx+P5lMhkAgAEAwGKS7u9tqOzg4SCAQIC8v\nj/HxcaueTqcJBoNWm3Q6jdvtZnJykqKiIpxOp+1YPh8QIiJyvS/+8nzixIlb7mvFU0wff/wx2WwW\ngCtXrtDV1UU4HCYYDBKLxZibmyMWi1FdXQ1AVVUVXV1djI6OEo/HycnJIS8vD1g8FdXR0cHExASd\nnZ1LAqK9vZ3Z2VnOnj1r9SUiIhvLYVY4p/PBBx9w4MABAAoKCviDP/gDGhoayGaz1NfXc/HiRSor\nK2lvb+f+++8H4NVXX+W1117jvvvu4/XXX2fv3r3A4lFDfX09v/zlL3nuuef43ve+Byy+zfWFF16g\nu7ub8vJyOjo6ePjhh68fqMNxU6e2NpPF02l2Y9+6cxKRrWEt+84VA2IzUUCIiNy8tew7dSe1iIjY\nUkCIiIgtBYSIiNhSQIiIiC0FhIiI2FJAiIiILQWEiIjYUkCIiIgtBYSIiNhSQIiIiC0FhIiI2FJA\niIiILQWEiIjYUkCIiIgtBYSIiNhSQIiIiC0FhIiI2FJAiIiILQWEiIjYWlVA/N///R9+v59nn30W\ngGw2SyQSweVyUVtby8zMjLVtS0sLFRUV+Hw+ent7rXomk6GyspLy8nKOHz9u1RcWFohGo5SWlhIK\nhRgbG1uvuYmIyBqsKiBeffVVfD4fDocDgLa2NlwuF8PDwxQXF3PmzBkALl++TGtrK+fPn6etrY3G\nxkarj6amJo4ePUoqlaKnp4f+/n4AOjs7mZqaIpPJEA6HOXny5HrPUUREbsENA+LnP/8577zzDn/0\nR3+EMQaAZDJJNBrF6XTS0NBAIpEAIJFIEA6Hcblc1NTUYIyxji6Ghoaoq6ujoKCAgwcPLmlTX19P\nbm4uhw8ftuoiIrKxbhgQ3/72t3n55ZfJyfls01QqhcfjAcDj8ZBMJoHFnb3X67W2c7vdJBIJRkZG\nKCwstOo+n4++vj5gMWx8Ph8A+fn5jI+PMz8/vw5TExGRtbhnpSd/9KMfUVhYiN/vJx6PW/VPjyRW\n49PTUp9njLHqxpgl/a3Ud3Nzs/V1KBQiFAqtehwiIttBPB5fsr9eixUD4l//9V956623eOedd7h6\n9SrT09M8//zzBAIBMpkMfr+fTCZDIBAAIBgM0t3dbbUfHBwkEAiQl5fH+Pi4VU+n0wSDQatNOp3G\n7XYzOTlJUVERTqfTdjyfDwgREbneF395PnHixC33teIppu9+97t89NFHfPDBB3R0dLBv3z7+4R/+\ngWAwSCwWY25ujlgsRnV1NQBVVVV0dXUxOjpKPB4nJyeHvLw8YPFUVEdHBxMTE3R2di4JiPb2dmZn\nZzl79qzVl4iIbKybug/i09NCR44cYXR0FLfbzf/8z//wx3/8xwAUFRVx5MgR9u3bx5/8yZ/w6quv\nWm2///3v81d/9VcEAgH27t3Ll7/8ZQAOHDjAjh078Hq9vPvuu7z00kvrNTcREVkDh7mZCwobyOFw\n3NS1j81kMVjtxr515yQiW8Na9p26k1pERGwpIERExJYCQkREbCkgRETElgJCRERsKSBERMSWAkJE\nRGwpIERExJYCQkREbCkgRETElgJCRERsKSBERMSWAkJERGwpIERExJYCQkREbCkgRETElgJCRERs\nKSBERMSWAkJERGytGBBXr14lGAyyZ88eqqureeWVVwDIZrNEIhFcLhe1tbXMzMxYbVpaWqioqMDn\n89Hb22vVM5kMlZWVlJeXc/z4cau+sLBANBqltLSUUCjE2NjYes9RRERuwYoB8aUvfYl/+Zd/4T/+\n4z/o6enhjTfeYHh4mLa2NlwuF8PDwxQXF3PmzBkALl++TGtrK+fPn6etrY3Gxkarr6amJo4ePUoq\nlaKnp4f+/n4AOjs7mZqaIpPJEA6HOXny5G2croiIrNYNTzHl5uYCMDMzwyeffILT6SSZTBKNRnE6\nnTQ0NJBIJABIJBKEw2FcLhc1NTUYY6yji6GhIerq6igoKODgwYNL2tTX15Obm8vhw4etuoiIbKwb\nBsSvfvUrdu/eTVFREd/61rdwuVykUik8Hg8AHo+HZDIJLO7svV6v1dbtdpNIJBgZGaGwsNCq+3w+\n+vr6AEgmk/h8PgDy8/MZHx9nfn5+/WYoIiK35J4bbZCTk8NPf/pTPvzwQ5555hm+8pWvYIxZ9Q9w\nOBzX1YwxVt0Ys6S/lfpubm62vg6FQoRCoVWPQ0RkO4jH48Tj8XXp64YB8amysjKeeeYZEokEgUCA\nTCaD3+8nk8kQCAQACAaDdHd3W20GBwcJBALk5eUxPj5u1dPpNMFg0GqTTqdxu91MTk5SVFSE0+m0\nHcPnA0JERK73xV+eT5w4cct9rXiKaWJigv/93/8F4Be/+AX//M//TCQSIRgMEovFmJubIxaLUV1d\nDUBVVRVdXV2Mjo4Sj8fJyckhLy8PWDwV1dHRwcTEBJ2dnUsCor29ndnZWc6ePWv1JSIiG8ys4P33\n3zd+v9888cQT5umnnzZ/93d/Z4wxZnp62uzfv9+UlJSYSCRistms1eb06dNm165dxuv1mgsXLlj1\ngYEB4/f7TVlZmTl27JhVv3btmjl06JApKSkxNTU15tKlS7ZjucFQNzXAgLF5bN05icjWsJb9jOPX\nHWx6Dofjpq59bCaL11vsxr515yQiW8Na9p26k1pERGwpIERExJYCQkREbCkgRETElgJCRERsKSBE\nRMSWAkJERGwpIERExJYCQkREbCkgRETElgJCRERsKSBERMSWAkJERGwpIERExJYCQkREbCkgRETE\nlgJCRERsKSBERMSWAkJERGytGBAfffQRTz31FI8++iihUIg333wTgGw2SyQSweVyUVtby8zMjNWm\npaWFiooKfD4fvb29Vj2TyVBZWUl5eTnHjx+36gsLC0SjUUpLSwmFQoyNja33HEVE5BasGBD33nsv\nr7zyCgMDA/zwhz/kpZdeIpvN0tbWhsvlYnh4mOLiYs6cOQPA5cuXaW1t5fz587S1tdHY2Gj11dTU\nxNGjR0mlUvT09NDf3w9AZ2cnU1NTZDIZwuEwJ0+evI3TFRGR1VoxIB5++GH27NkDwEMPPcSjjz5K\nKpUimUwSjUZxOp00NDSQSCQASCQShMNhXC4XNTU1GGOso4uhoSHq6uooKCjg4MGDS9rU19eTm5vL\n4cOHrbqIiGysVV+DGBkZYWBggKqqKlKpFB6PBwCPx0MymQQWd/Zer9dq43a7SSQSjIyMUFhYaNV9\nPh99fX0AJJNJfD4fAPn5+YyPjzM/P7/2mYmIyJrcs5qNstksdXV1vPLKK9x///0YY1b9AxwOx3U1\nY4xVN8Ys6W+lvpubm62vQ6EQoVBo1eMQEdkO4vE48Xh8Xfq6YUAsLCzwta99jeeff55IJAJAIBAg\nk8ng9/vJZDIEAgEAgsEg3d3dVtvBwUECgQB5eXmMj49b9XQ6TTAYtNqk02ncbjeTk5MUFRXhdDpt\nx/L5gBARket98ZfnEydO3HJfK55iMsYQjUZ57LHHePHFF616MBgkFosxNzdHLBajuroagKqqKrq6\nuhgdHSUej5OTk0NeXh6weCqqo6ODiYkJOjs7lwREe3s7s7OznD171upLREQ2mFnBe++9ZxwOh9m9\ne7fZs2eP2bNnjzl37pyZnp42+/fvNyUlJSYSiZhsNmu1OX36tNm1a5fxer3mwoULVn1gYMD4/X5T\nVlZmjh07ZtWvXbtmDh06ZEpKSkxNTY25dOmS7VhuMNRNDTBgbB5bd04isjWsZT/j+HUHm57D4bip\nax+byeL1Fruxb905icjWsJZ9p+6kFhERWwoIERGxpYAQERFbCggREbGlgBAREVsKCBERsaWAEBER\nWwoIERGxpYAQERFbCggREbGlgBAREVsKCBERsaWAEBERWwoIERGxpYAQERFbCggREbGlgBAREVsK\nCBERsaWAEBERWysGRENDA0VFRTz++ONWLZvNEolEcLlc1NbWMjMzYz3X0tJCRUUFPp+P3t5eq57J\nZKisrKS8vJzjx49b9YWFBaLRKKWlpYRCIcbGxtZzbiIisgYrBsShQ4d49913l9Ta2tpwuVwMDw9T\nXFzMmTNnALh8+TKtra2cP3+etrY2GhsbrTZNTU0cPXqUVCpFT08P/f39AHR2djI1NUUmkyEcDnPy\n5Mn1np+IiNyiFQNi7969PPjgg0tqyWSSaDSK0+mkoaGBRCIBQCKRIBwO43K5qKmpwRhjHV0MDQ1R\nV1dHQUEBBw8eXNKmvr6e3NxcDh8+bNVFRGTj3fQ1iFQqhcfjAcDj8ZBMJoHFnb3X67W2c7vdJBIJ\nRkZGKCwstOo+n4++vj5gMWx8Ph8A+fn5jI+PMz8/f+uzERGRdXPPzTYwxqx6W4fDYdv+07oxZkl/\nN+q7ubnZ+joUChEKhVY9FhGR7SAejxOPx9elr5sOiEAgQCaTwe/3k8lkCAQCAASDQbq7u63tBgcH\nCQQC5OXlMT4+btXT6TTBYNBqk06ncbvdTE5OUlRUhNPpXPZnfz4gNpsHHsgnm/3lRg9DRLa5L/7y\nfOLEiVvu66ZPMQWDQWKxGHNzc8RiMaqrqwGoqqqiq6uL0dFR4vE4OTk55OXlAYunojo6OpiYmKCz\ns3NJQLS3tzM7O8vZs2etvjarBx7Ix+Fw2D4Ww8Es8xAR2YLMCp577jmzc+dOc99995ni4mITi8XM\n9PS02b9/vykpKTGRSMRks1lr+9OnT5tdu3YZr9drLly4YNUHBgaM3+83ZWVl5tixY1b92rVr5tCh\nQ6akpMTU1NSYS5cuLTuWGwz1jgAMmGUet/Lcxs9JRO5ua9nPOH7dwabncDhu6vrH7RrD8kcEt/Lc\nxs9JRO5ua9l36k5qERGxpYAQERFbCggby12MXn/3LHvR+4EH8m/DzxMRWT1dg1jmZy13zWC9r0Es\n3+Ze4BPbZ/LyHmR6enKZdiIin9E1iLvSJyz3ttlsNqsjDxG57bZtQKx0T8Pmt1J46GY9EVkf2/YU\n0629ZfVOnmK69ee2yD+piNwBOsUkIiLrTgFx17F/Z5SuTYjIzbrpD+uTze7T6xNLZbNb4dqKiGwm\nOoIQERFbCggREbGlgBAREVsKCBERsaWA2Db0uU8icnP0LqZtw/7dTaB3OImIPR1BiIiILQWEiIjY\nUkAIuj4hInY2RUBcuHABr9dLRUUFr7322rr1u7U/sfVO2rqfDhuPxzd6CJuG1uIzWov1sSkC4k//\n9E95/fXX6e7u5m/+5m+YmJhYl34Xd272Oz5Zrc392U7aEXxGa/EZrcX62PCAmJqaAuD3f//3KS0t\n5emnnyaRSGzwqOQz9kcXm/3IQkTWbsMDIpVK4fF4rO99Ph99fX222+o8+Way/HULh+M+/VuJ3AXu\nivsgstlfrnBdYaXrDbfy3Gbv707/LDsLyz6z8r/VrTlx4sS69reVaS0+o7VYuw0PiEAgwJ//+Z9b\n3w8MDBAOh6/bTn8lTUTkztrwU0w7duwAFt/J9OGHH/LjH/+YYDC4waMSEZENP4IAOH36NC+88AIL\nCws0Njby0EMPbfSQRES2vQ0/ggCoqakhk8kwMjJCY2Pjkudu1z0SW0VZWRlPPPEEfr+fqqoqALLZ\nLJFIBJfLRW1tLTMzMxs8ytujoaGBoqIiHn/8cau20txbWlqoqKjA5/PR29u7EUO+bezWorm5meLi\nYvx+P36/n3PnzlnP3a1r8dFHH/HUU0/x6KOPEgqFePPNN4Ht+bpYbi3W9XVhNrk9e/aYnp4e8+GH\nHxq3222uXLmy0UO6o8rKyswvfvGLJbVTp06Zb33rW+bq1avmm9/8pnn55Zc3aHS314ULF8y///u/\nm8cee8yqLTf38fFx43a7zX//93+beDxu/H7/Rg37trBbi+bmZvPXf/3X1217N6/FpUuXzMWLF40x\nxly5csX8zu/8jpment6Wr4vl1mI9Xxeb4ghiObpHYpH5wgX6ZDJJNBrF6XTS0NBw167J3r17efDB\nB5fUlpt7IpEgHA7jcrmoqanBGEM2m92IYd8WdmsB9m/euJvX4uGHH2bPnj0APPTQQzz66KOkUqlt\n+bpYbi1g/V4XmzogbuYeibuVw+Fg37591NbW8tZbbwFL18Xj8ZBMJjdyiHfUcnNPJBJ4vV5rO7fb\nvS3W5bXXXqO6uppTp05Z/9mTyeS2WIuRkREGBgaoqqra9q+LT9fi0zf4rNfrYlMHhMBPfvITfvrT\nn/K9732PP/uzP2NsbGxbv+X3ZuZ+t3/m1pEjR/jggw/o6uriP//zP3n99dcB+zW629Yim81SV1fH\nK6+8wv3337+tXxefX4vf/M3fXNfXxaYOiEAgwODgoPX9wMAA1dXVGziiO2/nzp0AeL1e9u/fz9tv\nv00gECCTyQCQyWQIBAIbOcQ7arm5B4NB0um0td3g4OBdvy6FhYU4HA527NjBN7/5TTo7O4G7fy0W\nFhb42te+xvPPP08kEgG27+vCbi3W83WxqQNiu98j8fHHH1uHh1euXKGrq4twOEwwGCQWizE3N0cs\nFttWobnc3Kuqqujq6mJ0dJR4PE5OTg55eXkbPNrb69KlSwB88sknvPnmmzzzzDPA3b0Wxhii0SiP\nPfYYL774olXfjq+L5dZiXV8X63Ax/baKx+PG4/GYXbt2mVdffXWjh3NH/dd//ZfZvXu32b17t9m3\nb5954403jDHGTE9Pm/3795uSkhITiURMNpvd4JHeHs8995zZuXOnue+++0xxcbGJxWIrzv306dNm\n165dxuuaw79uAAAAiElEQVT1mgsXLmzgyNffp2tx7733muLiYvPGG2+Y559/3jz++OPmySefNN/+\n9reXvNvtbl2L9957zzgcDrN7926zZ88es2fPHnPu3Llt+bqwW4t33nlnXV8XDmO28QltERFZ1qY+\nxSQiIhtHASEiIrYUECIiYksBISIithQQIiJiSwEhIiK2/h+wHcz8S9gnYQAAAABJRU5ErkJggg==\n"
}
],
"prompt_number": 66
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"New mean"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#Find the new mean\n",
"mean_usa_new = df_users_new['age'].mean()\n",
"mean_usa_new"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 67,
"text": [
"37.895146669452686"
]
}
],
"prompt_number": 67
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"New standard deviation"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#find new standard deviation\n",
"std_usa_new = df_users_new['age'].std(axis=1)\n",
"std_usa_new"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 68,
"text": [
"11.189870572543851"
]
}
],
"prompt_number": 68
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#calculating z score\n",
"def zscore_new(age):\n",
" return (float(age)-mean_usa_new)/std_usa_new"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 69
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"z-score calculation after replacing missing age data"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#create a separate coumn in dataframe to calculate z-score\n",
"df_users_new['z_index']=df_users_new['age'].apply(zscore_new)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 70
},
{
"cell_type": "heading",
"level": 4,
"metadata": {},
"source": [
"Finding records which lie within -3.0 to +3.0 standard deviation"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#find minimum age that fall within standard devaition -3.0 to 3.0\n",
"df_users_new[df_users_new['z_index']>=-3.0][['age']].min()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 72,
"text": [
"age 5"
]
}
],
"prompt_number": 72
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#find max age that fall within standard devaition -3.0 to 3.0\n",
"df_users_new[df_users_new['z_index']<=3.0][['age']].max()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 73,
"text": [
"age 71"
]
}
],
"prompt_number": 73
},
{
"cell_type": "heading",
"level": 3,
"metadata": {},
"source": [
"NON USA LOCATION ANALYSIS"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"query_nonus = \"SELECT u.UserID,u.location, u.age from BXUsers u where location not like '%usa%'\"\n",
"print query_nonus\n",
"c.execute(query_nonus)\n",
"rows_nonus = c.fetchall()\n",
"df_usersnonus = DataFrame(rows_nonus,columns=['UserId','location','age'])\n",
"df_usersnonus.set_index('UserId')"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"SELECT u.UserID,u.location, u.age from BXUsers u where location not like '%usa%'\n"
]
},
{
"output_type": "pyout",
"prompt_number": 74,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 138887 entries, 3 to 278858\n",
"Data columns:\n",
"location 138887 non-null values\n",
"age 91427 non-null values\n",
"dtypes: float64(1), object(1)"
]
}
],
"prompt_number": 74
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"mean_nonus= df_usersnonus['age'].mean()\n",
"mean_nonus"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 76,
"text": [
"32.115173854550626"
]
}
],
"prompt_number": 76
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"std_nonus= df_usersnonus['age'].std()\n",
"std_nonus"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 78,
"text": [
"13.25913759334459"
]
}
],
"prompt_number": 78
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#query to replace missing age by mean age\n",
"query_upd_nonus = \"update BXUsers set age=\"+str(mean_nonus)+\" where age is null and location not like '%usa%'\"\n",
"print query_upd_nonus"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"update BXUsers set age=32.1151738546 where age is null and location not like '%usa%'\n"
]
}
],
"prompt_number": 79
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#replacing missing value by mean age\n",
"c.execute(query_upd_nonus)\n",
"con.commit()"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 80
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"def z_score_nonus(age):\n",
" return (float(age)-mean_nonus)/std_nonus"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 81
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#create a separate coumn in dataframe to calculate z-score\n",
"df_usersnonus['z_index']=df_usersnonus['age'].apply(z_score_nonus)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 82
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#find minimum age that fall within standard devaition -3.0 to 3.0\n",
"df_usersnonus[df_usersnonus['z_index']>=-3.0][['age']].min()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 83,
"text": [
"age 0"
]
}
],
"prompt_number": 83
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"#find max age that fall within standard devaition -3.0 to 3.0\n",
"df_usersnonus[df_usersnonus['z_index']<=3.0][['age']].max()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 84,
"text": [
"age 71"
]
}
],
"prompt_number": 84
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": [],
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment