adek05/IFMSA - ZPK.ipynb

## gistfile1.txt
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 509,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import plotly\n",
    "import cufflinks as cf\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 510,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import io\n",
    "import requests\n",
    "\n",
    "EXAMS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/exams.csv'\n",
    "PATIENTS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/patients.csv'\n",
    "\n",
    "patients_stream = requests.get(PATIENTS_URL).content\n",
    "patients = pd.read_csv(io.StringIO(patients_stream.decode('utf-8')))\n",
    "# patients = pd.read_csv('patients.csv', delimiter=',')\n",
    "\n",
    "exams_stream = requests.get(EXAMS_URL).content\n",
    "patients = pd.read_csv(io.StringIO(exams_stream.decode('utf-8')))\n",
    "# exams = pd.read_csv('exams.csv', delimiter=',')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 511,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cities = ['Szczecin', 'Wrocław', 'Lublin', 'Katowice', 'Gdańsk', 'Olsztyn', 'Kielce', 'Rzeszów']\n",
    "\n",
    "def extract_city_from_regional_id(regional_id):\n",
    "    global cities\n",
    "    matched_cities = [city for city in cities if city in regional_id]\n",
    "    if len(matched_cities) == 0:\n",
    "        return 'Unknown'\n",
    "    return matched_cities[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 512,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def extract_id_from_regional_id(regional_id):\n",
    "    match = re.match('.*?(\\d+$)', regional_id)\n",
    "    if not match:\n",
    "        return '0'\n",
    "    return match.group(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 513,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Unify Regional id\n",
    "patients['unique_ids'] = patients['regional_id'].apply(\n",
    "    lambda id: extract_city_from_regional_id(id) + extract_id_from_regional_id(id))\n",
    "exams['unique_ids'] = exams['regional_id'].apply(\n",
    "    lambda id: extract_city_from_regional_id(id) + extract_id_from_regional_id(id))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 514,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Extract city for each patient\n",
    "patients['city'] = patients['regional_id'].apply(lambda x: extract_city_from_regional_id(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 515,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "(\"'Series' object has no attribute 'height'\", 'occurred at index 0')",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, args, **kwds)\u001b[0m\n\u001b[1;32m   4150\u001b[0m                     \u001b[0;32mif\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4151\u001b[0m                         \u001b[0mreduce\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4152\u001b[0;31m                     \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4153\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4154\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_broadcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_apply_standard\u001b[0;34m(self, func, axis, ignore_failures, reduce)\u001b[0m\n\u001b[1;32m   4246\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4247\u001b[0m                 \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4248\u001b[0;31m                     \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4249\u001b[0m                     \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4250\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   2742\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2743\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2744\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2745\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2746\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: (\"'Series' object has no attribute 'height'\", 'occurred at index 0')"
     ]
    }
   ],
   "source": [
    "# Calculate BMI for each patient\n",
    "patients['bmi'] = patients.apply(lambda row: row.weight / (row.height/100)**2 if row.height != 0 else 0, axis=1)\n",
    "\n",
    "bmi_buckets = ['wychudzenie', 'niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
    "\n",
    "patients['bmi_bucket'] = pd.cut(patients['bmi'], bins=[0, 17, 18.5, 25, 30, 35, 40, 100], right=True, labels=bmi_buckets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Filter incomplete data rows\n",
    "patients = patients[~patients.unique_ids.str.contains('Unknown')]\n",
    "exams = exams[~exams.unique_ids.str.contains('Unknown')]\n",
    "exams = exams[~exams.regional_id.str.contains('test')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Select columns which matter\n",
    "patients = pd.DataFrame(patients, columns=['unique_ids', 'city', 'age', 'sex', 'smoker', 'bmi', 'bmi_bucket'])\n",
    "exams = pd.DataFrame(exams, columns=['exam_name', 'result', 'unique_ids'])\n",
    "\n",
    "# Select unique patients\n",
    "patients = patients.drop_duplicates(subset=['unique_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Prepare Glucose Results data\n",
    "glucose = pd.DataFrame(exams[exams.exam_name.str.contains('GlucoseLevel')])\n",
    "glucose['glucose_level'] = glucose['result'].apply(lambda x: int(x))\n",
    "glucose = pd.DataFrame(glucose, columns=['unique_ids', 'glucose_level', 'exam_name'])\n",
    "\n",
    "glucose_2h_after_meal = pd.DataFrame(glucose[glucose.exam_name == 'GlucoseLevelYes'])\n",
    "glucose_less_than_2h_after_meal = pd.DataFrame(glucose[glucose.exam_name == 'GlucoseLevelNo'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Prepare Blood Pressure Results data\n",
    "def extract_systolic(data):\n",
    "    return int(re.match('(\\d+)/\\d+', data).group(1))\n",
    "def extract_diastolic(data):\n",
    "    return int(re.match('\\d+/(\\d+)', data).group(1))\n",
    "\n",
    "blood_pressure = pd.DataFrame(exams[exams.exam_name == 'BloodPressureTest'])\n",
    "blood_pressure['systolic'] = blood_pressure['result'].apply(extract_systolic)\n",
    "blood_pressure['diastolic'] = blood_pressure['result'].apply(extract_diastolic)\n",
    "blood_pressure = pd.DataFrame(blood_pressure, columns=['unique_ids', 'systolic', 'diastolic'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Join datasets\n",
    "patients_blood_pressure = pd.merge(blood_pressure, patients, on='unique_ids')\n",
    "patients_glucose_2h_after_meal = pd.merge(glucose_2h_after_meal, patients, on='unique_ids')\n",
    "patients_glucose_less_than_2h_after_meal = pd.merge(glucose_less_than_2h_after_meal, patients, on='unique_ids')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "total_number_of_patients = len(patients)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## W sumie przebadano"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "total_number_of_patients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients.groupby('city').size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cf.set_config_file(world_readable=True,offline=True)\n",
    "city_count = pd.DataFrame({'count': patients.groupby('city', as_index=False).size()}).reset_index()\n",
    "city_count\n",
    "city_count.iplot(kind='pie', labels='city', values='count', textinfo='value', sort=True, colorscale='blues')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Ciśnienie zmierzono\n",
    "len(patients_blood_pressure)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Cukier 2h po jedzeniu zmierzono\n",
    "len(patients_glucose_2h_after_meal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Cukier mniej niż 2h po jedzeniu zmierzono\n",
    "len(patients_glucose_less_than_2h_after_meal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "sns.jointplot(\"diastolic\", \"systolic\", data=patients_blood_pressure, kind=\"scatter\", ylim=(80, 200), xlim=(30,140))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Smoker vs. Nonsmoker\n",
    "%matplotlib inline\n",
    "\n",
    "sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'], hue='smoker', size=5, markers='o', palette=\"PuOr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients_blood_pressure['age_bucket'] = pd.cut(patients_blood_pressure['age'], bins=[18, 29, 39, 49, 65, 100], right=False)\n",
    "sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'], hue='age_bucket', size=5, markers='o', palette=\"PuOr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients_glucose_2h_after_meal['age_bucket'] = \\\n",
    "  pd.cut(patients_glucose_2h_after_meal['age'], bins=[0, 18, 29, 39, 49, 65, 100], right=False)\n",
    "\n",
    "# sugar_2h_after_rows.sort_values(by='glucose_yes', ascending=False)\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, jitter=0.15)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients_glucose_less_than_2h_after_meal['age_bucket'] = \\\n",
    "  pd.cut(patients_glucose_less_than_2h_after_meal['age'], bins=[0, 18, 29, 39, 49, 65, 100], right=False)\n",
    "\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_less_than_2h_after_meal, jitter=0.15)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "f, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Log scale and manually picked labels\n",
    "ax.set(yscale='log')\n",
    "\n",
    "y=[30, 50, 80, 100, 120, 200, 300, 400, 500]\n",
    "plt.semilogy(y, y)\n",
    "plt.yticks(y, y)\n",
    "\n",
    "# Horizontal line for 200\n",
    "plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
    "\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, jitter=0.25, hue='smoker', split=True, ax = ax)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "f, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Log scale and manually picked labels\n",
    "ax.set(yscale='log')\n",
    "\n",
    "y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
    "plt.semilogy(y, y)\n",
    "plt.yticks(y, y)\n",
    "\n",
    "# Horizontal line for 200\n",
    "plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
    "\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_less_than_2h_after_meal, jitter=0.25, hue='smoker', split=True, ax=ax)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "f, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Log scale and hand pick labels\n",
    "ax.set(yscale='log')\n",
    "\n",
    "y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
    "plt.semilogy(y, y)\n",
    "plt.yticks(y, y)\n",
    "\n",
    "# Horizontal line for 200\n",
    "plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
    "\n",
    "sns.boxplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, hue='city', fliersize=10, whis=0.9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Ideas:\n",
    "# Wrocław vs. Szczecin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## BMI\n",
    "f, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Log scale and manually picked labels\n",
    "ax.set(yscale='log')\n",
    "\n",
    "y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
    "plt.semilogy(y, y)\n",
    "plt.yticks(y, y)\n",
    "\n",
    "# Horizontal line for 200\n",
    "plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
    "\n",
    "sns.stripplot(x=\"bmi_bucket\", y=\"glucose_level\", \n",
    "              data=patients_glucose_less_than_2h_after_meal, jitter=0.25, hue='smoker',\n",
    "              split=True, ax=ax,\n",
    "              order=['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "colors = ['blue', 'green', 'yellow', 'orange', 'red', 'dark red']\n",
    "# colors = ['blue', 'green', 'red', 'red', 'red', 'red']\n",
    "\n",
    "sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'],\n",
    "             hue='bmi_bucket',\n",
    "             size=8, markers='o', palette=sns.xkcd_palette(colors),\n",
    "             hue_order=['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

## IFMSA - ZPK.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 509,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import plotly\n",
    "import cufflinks as cf\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 510,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import io\n",
    "import requests\n",
    "\n",
    "EXAMS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/exams.csv'\n",
    "PATIENTS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/patients.csv'\n",
    "\n",
    "patients_stream = requests.get(PATIENTS_URL).content\n",
    "patients = pd.read_csv(io.StringIO(patients_stream.decode('utf-8')))\n",
    "# patients = pd.read_csv('patients.csv', delimiter=',')\n",
    "\n",
    "exams_stream = requests.get(EXAMS_URL).content\n",
    "exams = pd.read_csv(io.StringIO(exams_stream.decode('utf-8')))\n",
    "# exams = pd.read_csv('exams.csv', delimiter=',')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 511,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cities = ['Szczecin', 'Wrocław', 'Lublin', 'Katowice', 'Gdańsk', 'Olsztyn', 'Kielce', 'Rzeszów']\n",
    "\n",
    "def extract_city_from_regional_id(regional_id):\n",
    "    global cities\n",
    "    matched_cities = [city for city in cities if city in regional_id]\n",
    "    if len(matched_cities) == 0:\n",
    "        return 'Unknown'\n",
    "    return matched_cities[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 512,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def extract_id_from_regional_id(regional_id):\n",
    "    match = re.match('.*?(\\d+$)', regional_id)\n",
    "    if not match:\n",
    "        return '0'\n",
    "    return match.group(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 513,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Unify Regional id\n",
    "patients['unique_ids'] = patients['regional_id'].apply(\n",
    "    lambda id: extract_city_from_regional_id(id) + extract_id_from_regional_id(id))\n",
    "exams['unique_ids'] = exams['regional_id'].apply(\n",
    "    lambda id: extract_city_from_regional_id(id) + extract_id_from_regional_id(id))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 514,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Extract city for each patient\n",
    "patients['city'] = patients['regional_id'].apply(lambda x: extract_city_from_regional_id(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 515,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "(\"'Series' object has no attribute 'height'\", 'occurred at index 0')",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, args, **kwds)\u001b[0m\n\u001b[1;32m   4150\u001b[0m                     \u001b[0;32mif\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4151\u001b[0m                         \u001b[0mreduce\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4152\u001b[0;31m                     \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4153\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4154\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_broadcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_apply_standard\u001b[0;34m(self, func, axis, ignore_failures, reduce)\u001b[0m\n\u001b[1;32m   4246\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4247\u001b[0m                 \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4248\u001b[0;31m                     \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4249\u001b[0m                     \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4250\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   2742\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2743\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2744\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2745\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2746\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: (\"'Series' object has no attribute 'height'\", 'occurred at index 0')"
     ]
    }
   ],
   "source": [
    "# Calculate BMI for each patient\n",
    "patients['bmi'] = patients.apply(lambda row: row.weight / (row.height/100)**2 if row.height != 0 else 0, axis=1)\n",
    "\n",
    "bmi_buckets = ['wychudzenie', 'niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
    "\n",
    "patients['bmi_bucket'] = pd.cut(patients['bmi'], bins=[0, 17, 18.5, 25, 30, 35, 40, 100], right=True, labels=bmi_buckets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Filter incomplete data rows\n",
    "patients = patients[~patients.unique_ids.str.contains('Unknown')]\n",
    "exams = exams[~exams.unique_ids.str.contains('Unknown')]\n",
    "exams = exams[~exams.regional_id.str.contains('test')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Select columns which matter\n",
    "patients = pd.DataFrame(patients, columns=['unique_ids', 'city', 'age', 'sex', 'smoker', 'bmi', 'bmi_bucket'])\n",
    "exams = pd.DataFrame(exams, columns=['exam_name', 'result', 'unique_ids'])\n",
    "\n",
    "# Select unique patients\n",
    "patients = patients.drop_duplicates(subset=['unique_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Prepare Glucose Results data\n",
    "glucose = pd.DataFrame(exams[exams.exam_name.str.contains('GlucoseLevel')])\n",
    "glucose['glucose_level'] = glucose['result'].apply(lambda x: int(x))\n",
    "glucose = pd.DataFrame(glucose, columns=['unique_ids', 'glucose_level', 'exam_name'])\n",
    "\n",
    "glucose_2h_after_meal = pd.DataFrame(glucose[glucose.exam_name == 'GlucoseLevelYes'])\n",
    "glucose_less_than_2h_after_meal = pd.DataFrame(glucose[glucose.exam_name == 'GlucoseLevelNo'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Prepare Blood Pressure Results data\n",
    "def extract_systolic(data):\n",
    "    return int(re.match('(\\d+)/\\d+', data).group(1))\n",
    "def extract_diastolic(data):\n",
    "    return int(re.match('\\d+/(\\d+)', data).group(1))\n",
    "\n",
    "blood_pressure = pd.DataFrame(exams[exams.exam_name == 'BloodPressureTest'])\n",
    "blood_pressure['systolic'] = blood_pressure['result'].apply(extract_systolic)\n",
    "blood_pressure['diastolic'] = blood_pressure['result'].apply(extract_diastolic)\n",
    "blood_pressure = pd.DataFrame(blood_pressure, columns=['unique_ids', 'systolic', 'diastolic'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Join datasets\n",
    "patients_blood_pressure = pd.merge(blood_pressure, patients, on='unique_ids')\n",
    "patients_glucose_2h_after_meal = pd.merge(glucose_2h_after_meal, patients, on='unique_ids')\n",
    "patients_glucose_less_than_2h_after_meal = pd.merge(glucose_less_than_2h_after_meal, patients, on='unique_ids')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "total_number_of_patients = len(patients)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## W sumie przebadano"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "total_number_of_patients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients.groupby('city').size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cf.set_config_file(world_readable=True,offline=True)\n",
    "city_count = pd.DataFrame({'count': patients.groupby('city', as_index=False).size()}).reset_index()\n",
    "city_count\n",
    "city_count.iplot(kind='pie', labels='city', values='count', textinfo='value', sort=True, colorscale='blues')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Ciśnienie zmierzono\n",
    "len(patients_blood_pressure)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Cukier 2h po jedzeniu zmierzono\n",
    "len(patients_glucose_2h_after_meal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Cukier mniej niż 2h po jedzeniu zmierzono\n",
    "len(patients_glucose_less_than_2h_after_meal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "sns.jointplot(\"diastolic\", \"systolic\", data=patients_blood_pressure, kind=\"scatter\", ylim=(80, 200), xlim=(30,140))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Smoker vs. Nonsmoker\n",
    "%matplotlib inline\n",
    "\n",
    "sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'], hue='smoker', size=5, markers='o', palette=\"PuOr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients_blood_pressure['age_bucket'] = pd.cut(patients_blood_pressure['age'], bins=[18, 29, 39, 49, 65, 100], right=False)\n",
    "sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'], hue='age_bucket', size=5, markers='o', palette=\"PuOr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients_glucose_2h_after_meal['age_bucket'] = \\\n",
    "  pd.cut(patients_glucose_2h_after_meal['age'], bins=[0, 18, 29, 39, 49, 65, 100], right=False)\n",
    "\n",
    "# sugar_2h_after_rows.sort_values(by='glucose_yes', ascending=False)\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, jitter=0.15)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients_glucose_less_than_2h_after_meal['age_bucket'] = \\\n",
    "  pd.cut(patients_glucose_less_than_2h_after_meal['age'], bins=[0, 18, 29, 39, 49, 65, 100], right=False)\n",
    "\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_less_than_2h_after_meal, jitter=0.15)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "f, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Log scale and manually picked labels\n",
    "ax.set(yscale='log')\n",
    "\n",
    "y=[30, 50, 80, 100, 120, 200, 300, 400, 500]\n",
    "plt.semilogy(y, y)\n",
    "plt.yticks(y, y)\n",
    "\n",
    "# Horizontal line for 200\n",
    "plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
    "\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, jitter=0.25, hue='smoker', split=True, ax = ax)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "f, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Log scale and manually picked labels\n",
    "ax.set(yscale='log')\n",
    "\n",
    "y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
    "plt.semilogy(y, y)\n",
    "plt.yticks(y, y)\n",
    "\n",
    "# Horizontal line for 200\n",
    "plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
    "\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_less_than_2h_after_meal, jitter=0.25, hue='smoker', split=True, ax=ax)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "f, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Log scale and hand pick labels\n",
    "ax.set(yscale='log')\n",
    "\n",
    "y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
    "plt.semilogy(y, y)\n",
    "plt.yticks(y, y)\n",
    "\n",
    "# Horizontal line for 200\n",
    "plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
    "\n",
    "sns.boxplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, hue='city', fliersize=10, whis=0.9)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Ideas:\n",
    "# Wrocław vs. Szczecin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## BMI\n",
    "f, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Log scale and manually picked labels\n",
    "ax.set(yscale='log')\n",
    "\n",
    "y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
    "plt.semilogy(y, y)\n",
    "plt.yticks(y, y)\n",
    "\n",
    "# Horizontal line for 200\n",
    "plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
    "\n",
    "sns.stripplot(x=\"bmi_bucket\", y=\"glucose_level\", \n",
    "              data=patients_glucose_less_than_2h_after_meal, jitter=0.25, hue='smoker',\n",
    "              split=True, ax=ax,\n",
    "              order=['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "colors = ['blue', 'green', 'yellow', 'orange', 'red', 'dark red']\n",
    "# colors = ['blue', 'green', 'red', 'red', 'red', 'red']\n",
    "\n",
    "sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'],\n",
    "             hue='bmi_bucket',\n",
    "             size=8, markers='o', palette=sns.xkcd_palette(colors),\n",
    "             hue_order=['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

## IFMSA - ZPK_.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 509,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import plotly\n",
    "import cufflinks as cf\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 510,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import io\n",
    "import requests\n",
    "\n",
    "EXAMS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/exams.csv'\n",
    "PATIENTS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/patients.csv'\n",
    "\n",
    "patients_stream = requests.get(PATIENTS_URL).content\n",
    "patients = pd.read_csv(io.StringIO(patients_stream.decode('utf-8')))\n",
    "# patients = pd.read_csv('patients.csv', delimiter=',')\n",
    "\n",
    "exams_stream = requests.get(EXAMS_URL).content\n",
    "exams = pd.read_csv(io.StringIO(exams_stream.decode('utf-8')))\n",
    "# exams = pd.read_csv('exams.csv', delimiter=',')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 511,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cities = ['Szczecin', 'Wrocław', 'Lublin', 'Katowice', 'Gdańsk', 'Olsztyn', 'Kielce', 'Rzeszów']\n",
    "\n",
    "def extract_city_from_regional_id(regional_id):\n",
    "    global cities\n",
    "    matched_cities = [city for city in cities if city in regional_id]\n",
    "    if len(matched_cities) == 0:\n",
    "        return 'Unknown'\n",
    "    return matched_cities[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 512,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def extract_id_from_regional_id(regional_id):\n",
    "    match = re.match('.*?(\\d+$)', regional_id)\n",
    "    if not match:\n",
    "        return '0'\n",
    "    return match.group(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 513,
   "metadata": {
    "collapsed": false,
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Unify Regional id\n",
    "patients['unique_ids'] = patients['regional_id'].apply(\n",
    "    lambda id: extract_city_from_regional_id(id) + extract_id_from_regional_id(id))\n",
    "exams['unique_ids'] = exams['regional_id'].apply(\n",
    "    lambda id: extract_city_from_regional_id(id) + extract_id_from_regional_id(id))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 514,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Extract city for each patient\n",
    "patients['city'] = patients['regional_id'].apply(lambda x: extract_city_from_regional_id(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 515,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "(\"'Series' object has no attribute 'height'\", 'occurred at index 0')",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, args, **kwds)\u001b[0m\n\u001b[1;32m   4150\u001b[0m                     \u001b[0;32mif\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4151\u001b[0m                         \u001b[0mreduce\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4152\u001b[0;31m                     \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4153\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4154\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_broadcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_apply_standard\u001b[0;34m(self, func, axis, ignore_failures, reduce)\u001b[0m\n\u001b[1;32m   4246\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4247\u001b[0m                 \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4248\u001b[0;31m                     \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   4249\u001b[0m                     \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   4250\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m   2742\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2743\u001b[0m                 \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2744\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   2745\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   2746\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mAttributeError\u001b[0m: (\"'Series' object has no attribute 'height'\", 'occurred at index 0')"
     ]
    }
   ],
   "source": [
    "# Calculate BMI for each patient\n",
    "patients['bmi'] = patients.apply(lambda row: row.weight / (row.height/100)**2 if row.height != 0 else 0, axis=1)\n",
    "\n",
    "bmi_buckets = ['wychudzenie', 'niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
    "\n",
    "patients['bmi_bucket'] = pd.cut(patients['bmi'], bins=[0, 17, 18.5, 25, 30, 35, 40, 100], right=True, labels=bmi_buckets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Filter incomplete data rows\n",
    "patients = patients[~patients.unique_ids.str.contains('Unknown')]\n",
    "exams = exams[~exams.unique_ids.str.contains('Unknown')]\n",
    "exams = exams[~exams.regional_id.str.contains('test')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Select columns which matter\n",
    "patients = pd.DataFrame(patients, columns=['unique_ids', 'city', 'age', 'sex', 'smoker', 'bmi', 'bmi_bucket'])\n",
    "exams = pd.DataFrame(exams, columns=['exam_name', 'result', 'unique_ids'])\n",
    "\n",
    "# Select unique patients\n",
    "patients = patients.drop_duplicates(subset=['unique_ids'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Prepare Glucose Results data\n",
    "glucose = pd.DataFrame(exams[exams.exam_name.str.contains('GlucoseLevel')])\n",
    "glucose['glucose_level'] = glucose['result'].apply(lambda x: int(x))\n",
    "glucose = pd.DataFrame(glucose, columns=['unique_ids', 'glucose_level', 'exam_name'])\n",
    "\n",
    "glucose_2h_after_meal = pd.DataFrame(glucose[glucose.exam_name == 'GlucoseLevelYes'])\n",
    "glucose_less_than_2h_after_meal = pd.DataFrame(glucose[glucose.exam_name == 'GlucoseLevelNo'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Prepare Blood Pressure Results data\n",
    "def extract_systolic(data):\n",
    "    return int(re.match('(\\d+)/\\d+', data).group(1))\n",
    "def extract_diastolic(data):\n",
    "    return int(re.match('\\d+/(\\d+)', data).group(1))\n",
    "\n",
    "blood_pressure = pd.DataFrame(exams[exams.exam_name == 'BloodPressureTest'])\n",
    "blood_pressure['systolic'] = blood_pressure['result'].apply(extract_systolic)\n",
    "blood_pressure['diastolic'] = blood_pressure['result'].apply(extract_diastolic)\n",
    "blood_pressure = pd.DataFrame(blood_pressure, columns=['unique_ids', 'systolic', 'diastolic'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Join datasets\n",
    "patients_blood_pressure = pd.merge(blood_pressure, patients, on='unique_ids')\n",
    "patients_glucose_2h_after_meal = pd.merge(glucose_2h_after_meal, patients, on='unique_ids')\n",
    "patients_glucose_less_than_2h_after_meal = pd.merge(glucose_less_than_2h_after_meal, patients, on='unique_ids')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "total_number_of_patients = len(patients)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## W sumie przebadano"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "total_number_of_patients"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients.groupby('city').size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "cf.set_config_file(world_readable=True,offline=True)\n",
    "city_count = pd.DataFrame({'count': patients.groupby('city', as_index=False).size()}).reset_index()\n",
    "city_count\n",
    "city_count.iplot(kind='pie', labels='city', values='count', textinfo='value', sort=True, colorscale='blues')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Ciśnienie zmierzono\n",
    "len(patients_blood_pressure)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## Cukier 2h po jedzeniu zmierzono\n",
    "len(patients_glucose_2h_after_meal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Cukier mniej niż 2h po jedzeniu zmierzono\n",
    "len(patients_glucose_less_than_2h_after_meal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "sns.jointplot(\"diastolic\", \"systolic\", data=patients_blood_pressure, kind=\"scatter\", ylim=(80, 200), xlim=(30,140))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "## Smoker vs. Nonsmoker\n",
    "%matplotlib inline\n",
    "\n",
    "sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'], hue='smoker', size=5, markers='o', palette=\"PuOr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients_blood_pressure['age_bucket'] = pd.cut(patients_blood_pressure['age'], bins=[18, 29, 39, 49, 65, 100], right=False)\n",
    "sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'], hue='age_bucket', size=5, markers='o', palette=\"PuOr\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients_glucose_2h_after_meal['age_bucket'] = \\\n",
    "  pd.cut(patients_glucose_2h_after_meal['age'], bins=[0, 18, 29, 39, 49, 65, 100], right=False)\n",
    "\n",
    "# sugar_2h_after_rows.sort_values(by='glucose_yes', ascending=False)\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, jitter=0.15)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "patients_glucose_less_than_2h_after_meal['age_bucket'] = \\\n",
    "  pd.cut(patients_glucose_less_than_2h_after_meal['age'], bins=[0, 18, 29, 39, 49, 65, 100], right=False)\n",
    "\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_less_than_2h_after_meal, jitter=0.15)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "f, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Log scale and manually picked labels\n",
    "ax.set(yscale='log')\n",
    "\n",
    "y=[30, 50, 80, 100, 120, 200, 300, 400, 500]\n",
    "plt.semilogy(y, y)\n",
    "plt.yticks(y, y)\n",
    "\n",
    "# Horizontal line for 200\n",
    "plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
    "\n",
    "sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, jitter=0.25, hue='smoker', split=True, ax = ax)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Logarithmic y axis, labelled only at hand-picked glucose values.\n",
    "ax.set_yscale('log')\n",
    "glucose_ticks = [30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
    "ax.semilogy(glucose_ticks, glucose_ticks)\n",
    "ax.set_yticks(glucose_ticks)\n",
    "ax.set_yticklabels(glucose_ticks)\n",
    "\n",
    "# Dashed red reference line at a glucose level of 200.\n",
    "ax.plot([-10, 10], [200, 200], '--', color='red')\n",
    "\n",
    "# NOTE(review): newer seaborn releases renamed `split` to `dodge` for\n",
    "# stripplot -- confirm against the installed version before changing it.\n",
    "sns.stripplot(\n",
    "    x='age_bucket',\n",
    "    y='glucose_level',\n",
    "    hue='smoker',\n",
    "    data=patients_glucose_less_than_2h_after_meal,\n",
    "    jitter=0.25,\n",
    "    split=True,\n",
    "    ax=ax,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Logarithmic y axis, labelled only at hand-picked glucose values.\n",
    "ax.set_yscale('log')\n",
    "glucose_ticks = [30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
    "ax.semilogy(glucose_ticks, glucose_ticks)\n",
    "ax.set_yticks(glucose_ticks)\n",
    "ax.set_yticklabels(glucose_ticks)\n",
    "\n",
    "# Dashed red reference line at a glucose level of 200.\n",
    "ax.plot([-10, 10], [200, 200], '--', color='red')\n",
    "\n",
    "# Draw on the axes created above (the current axes at this point).\n",
    "sns.boxplot(\n",
    "    x='age_bucket',\n",
    "    y='glucose_level',\n",
    "    hue='city',\n",
    "    data=patients_glucose_2h_after_meal,\n",
    "    fliersize=10,\n",
    "    whis=0.9,\n",
    "    ax=ax,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "## Ideas:\n",
    "# Wrocław vs. Szczecin"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "## BMI\n",
    "fig, ax = plt.subplots(figsize=(9, 9))\n",
    "\n",
    "# Logarithmic y axis, labelled only at hand-picked glucose values.\n",
    "ax.set_yscale('log')\n",
    "glucose_ticks = [30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
    "ax.semilogy(glucose_ticks, glucose_ticks)\n",
    "ax.set_yticks(glucose_ticks)\n",
    "ax.set_yticklabels(glucose_ticks)\n",
    "\n",
    "# Dashed red reference line at a glucose level of 200.\n",
    "ax.plot([-10, 10], [200, 200], '--', color='red')\n",
    "\n",
    "# BMI classes shown on the x axis, in display order.\n",
    "bmi_classes = ['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
    "\n",
    "# NOTE(review): newer seaborn releases renamed `split` to `dodge` for\n",
    "# stripplot -- confirm against the installed version before changing it.\n",
    "sns.stripplot(\n",
    "    x='bmi_bucket',\n",
    "    y='glucose_level',\n",
    "    hue='smoker',\n",
    "    data=patients_glucose_less_than_2h_after_meal,\n",
    "    order=bmi_classes,\n",
    "    jitter=0.25,\n",
    "    split=True,\n",
    "    ax=ax,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Six xkcd colour names, one per entry of the hue order below.\n",
    "colors = ['blue', 'green', 'yellow', 'orange', 'red', 'dark red']\n",
    "bmi_classes = ['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
    "\n",
    "# NOTE(review): newer seaborn releases renamed pairplot's `size` to\n",
    "# `height` -- confirm against the installed version before changing it.\n",
    "sns.pairplot(\n",
    "    patients_blood_pressure,\n",
    "    x_vars=['diastolic'],\n",
    "    y_vars=['systolic'],\n",
    "    hue='bmi_bucket',\n",
    "    hue_order=bmi_classes,\n",
    "    palette=sns.xkcd_palette(colors),\n",
    "    markers='o',\n",
    "    size=8,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 509,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"import plotly\n",
	"import cufflinks as cf\n",
	"import seaborn as sns\n",
	"import matplotlib.pyplot as plt"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 510,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import io\n",
	"import requests\n",
	"\n",
	"EXAMS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/exams.csv'\n",
	"PATIENTS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/patients.csv'\n",
	"\n",
	"# Download the patients CSV; fail on an HTTP error instead of parsing an error page.\n",
	"patients_response = requests.get(PATIENTS_URL)\n",
	"patients_response.raise_for_status()\n",
	"patients_stream = patients_response.content\n",
	"patients = pd.read_csv(io.StringIO(patients_stream.decode('utf-8')))\n",
	"# Offline alternative: patients = pd.read_csv('patients.csv', delimiter=',')\n",
	"\n",
	"# Download the exams CSV into its own frame; later cells read both\n",
	"# `patients` and `exams`, so the two must not share a name.\n",
	"exams_response = requests.get(EXAMS_URL)\n",
	"exams_response.raise_for_status()\n",
	"exams_stream = exams_response.content\n",
	"exams = pd.read_csv(io.StringIO(exams_stream.decode('utf-8')))\n",
	"# Offline alternative: exams = pd.read_csv('exams.csv', delimiter=',')\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 511,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"cities = ['Szczecin', 'Wrocław', 'Lublin', 'Katowice', 'Gdańsk', 'Olsztyn', 'Kielce', 'Rzeszów']\n",
	"\n",
	"def extract_city_from_regional_id(regional_id):\n",
	"    \"\"\"Return the first entry of `cities` whose name occurs in `regional_id`.\n",
	"\n",
	"    Returns 'Unknown' when no listed city occurs in the identifier, or when\n",
	"    the identifier is not a string (for example a missing value).\n",
	"    \"\"\"\n",
	"    # A non-string value cannot be searched with `in`; treat it as unknown.\n",
	"    if not isinstance(regional_id, str):\n",
	"        return 'Unknown'\n",
	"    return next((city for city in cities if city in regional_id), 'Unknown')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 512,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import re\n",
	"\n",
	"def extract_id_from_regional_id(regional_id):\n",
	"    \"\"\"Return the digits that end `regional_id` as a string, or '0' if there are none.\"\"\"\n",
	"    # Raw string so that `\\d` reaches the regex engine as written.\n",
	"    match = re.match(r'.*?(\\d+$)', regional_id)\n",
	"    return match.group(1) if match else '0'"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 513,
	"metadata": {
	"collapsed": false,
	"scrolled": false
	},
	"outputs": [],
	"source": [
	"# Unify Regional id\n",
	"def build_unique_id(regional_id):\n",
	"    \"\"\"Concatenate the city and the id extracted from `regional_id`.\"\"\"\n",
	"    return extract_city_from_regional_id(regional_id) + extract_id_from_regional_id(regional_id)\n",
	"\n",
	"# Both tables get the same normalised key column.\n",
	"patients['unique_ids'] = patients['regional_id'].apply(build_unique_id)\n",
	"exams['unique_ids'] = exams['regional_id'].apply(build_unique_id)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 514,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# City of each patient, derived from the regional id.\n",
	"patients['city'] = patients['regional_id'].apply(extract_city_from_regional_id)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 515,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"ename": "AttributeError",
	"evalue": "(\"'Series' object has no attribute 'height'\", 'occurred at index 0')",
	"output_type": "error",
	"traceback": [
	"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
	"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
	"\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, args, **kwds)\u001b[0m\n\u001b[1;32m 4150\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4151\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4152\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4153\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4154\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_broadcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_apply_standard\u001b[0;34m(self, func, axis, ignore_failures, reduce)\u001b[0m\n\u001b[1;32m 4246\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4247\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4248\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4249\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4250\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 2742\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2743\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2744\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2745\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2746\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
	"\u001b[0;31mAttributeError\u001b[0m: (\"'Series' object has no attribute 'height'\", 'occurred at index 0')"
	]
	}
	],
	"source": [
	"# Calculate BMI for each patient\n",
	"def bmi_for_row(row):\n",
	"    \"\"\"Return weight / (height / 100) ** 2 for one patient row, or 0 when height is 0.\"\"\"\n",
	"    # presumably weight is in kg and height in cm -- confirm against the CSV\n",
	"    if row.height == 0:\n",
	"        return 0\n",
	"    return row.weight / (row.height / 100) ** 2\n",
	"\n",
	"patients['bmi'] = patients.apply(bmi_for_row, axis=1)\n",
	"\n",
	"# Class labels and the right-closed BMI intervals they are assigned to.\n",
	"bmi_buckets = ['wychudzenie', 'niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
	"bmi_bin_edges = [0, 17, 18.5, 25, 30, 35, 40, 100]\n",
	"\n",
	"patients['bmi_bucket'] = pd.cut(\n",
	"    patients['bmi'],\n",
	"    bins=bmi_bin_edges,\n",
	"    right=True,\n",
	"    labels=bmi_buckets,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Drop rows whose unified id contains 'Unknown'.\n",
	"patients_known = ~patients['unique_ids'].str.contains('Unknown')\n",
	"patients = patients[patients_known]\n",
	"\n",
	"exams_known = ~exams['unique_ids'].str.contains('Unknown')\n",
	"exams = exams[exams_known]\n",
	"\n",
	"# Drop exam rows whose regional id contains 'test'.\n",
	"exams_not_test = ~exams['regional_id'].str.contains('test')\n",
	"exams = exams[exams_not_test]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Columns kept for the rest of the analysis.\n",
	"patient_columns = ['unique_ids', 'city', 'age', 'sex', 'smoker', 'bmi', 'bmi_bucket']\n",
	"exam_columns = ['exam_name', 'result', 'unique_ids']\n",
	"\n",
	"exams = pd.DataFrame(exams, columns=exam_columns)\n",
	"\n",
	"# Keep the first row for each unified patient id.\n",
	"patients = pd.DataFrame(patients, columns=patient_columns).drop_duplicates(subset=['unique_ids'])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Glucose results: every exam whose name contains 'GlucoseLevel'.\n",
	"is_glucose_exam = exams['exam_name'].str.contains('GlucoseLevel')\n",
	"glucose = pd.DataFrame(exams[is_glucose_exam])\n",
	"glucose['glucose_level'] = glucose['result'].apply(int)\n",
	"glucose = pd.DataFrame(glucose, columns=['unique_ids', 'glucose_level', 'exam_name'])\n",
	"\n",
	"# Split by exam name: 'GlucoseLevelYes' and 'GlucoseLevelNo'.\n",
	"measured_2h_after_meal = glucose['exam_name'] == 'GlucoseLevelYes'\n",
	"measured_less_than_2h_after_meal = glucose['exam_name'] == 'GlucoseLevelNo'\n",
	"glucose_2h_after_meal = pd.DataFrame(glucose[measured_2h_after_meal])\n",
	"glucose_less_than_2h_after_meal = pd.DataFrame(glucose[measured_less_than_2h_after_meal])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Prepare Blood Pressure Results data\n",
	"def extract_systolic(data):\n",
	"    \"\"\"Return the integer before the '/' of a reading such as '120/80'.\n",
	"\n",
	"    Raises AttributeError when `data` does not start with digits, a slash\n",
	"    and digits, because `re.match` then returns None.\n",
	"    \"\"\"\n",
	"    # Raw string so that `\\d` reaches the regex engine as written.\n",
	"    return int(re.match(r'(\\d+)/\\d+', data).group(1))\n",
	"\n",
	"def extract_diastolic(data):\n",
	"    \"\"\"Return the integer after the '/' of a reading such as '120/80'.\n",
	"\n",
	"    Raises AttributeError when `data` does not start with digits, a slash\n",
	"    and digits, because `re.match` then returns None.\n",
	"    \"\"\"\n",
	"    return int(re.match(r'\\d+/(\\d+)', data).group(1))\n",
	"\n",
	"# Blood pressure results: exams named exactly 'BloodPressureTest'.\n",
	"is_pressure_exam = exams['exam_name'] == 'BloodPressureTest'\n",
	"blood_pressure = pd.DataFrame(exams[is_pressure_exam])\n",
	"\n",
	"# Parse both numbers out of each result string.\n",
	"pressure_readings = blood_pressure['result']\n",
	"blood_pressure['systolic'] = pressure_readings.apply(extract_systolic)\n",
	"blood_pressure['diastolic'] = pressure_readings.apply(extract_diastolic)\n",
	"\n",
	"pressure_columns = ['unique_ids', 'systolic', 'diastolic']\n",
	"blood_pressure = pd.DataFrame(blood_pressure, columns=pressure_columns)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Attach the patient attributes to every exam table via the unified id.\n",
	"patients_blood_pressure = blood_pressure.merge(patients, on='unique_ids')\n",
	"patients_glucose_2h_after_meal = glucose_2h_after_meal.merge(patients, on='unique_ids')\n",
	"patients_glucose_less_than_2h_after_meal = glucose_less_than_2h_after_meal.merge(\n",
	"    patients, on='unique_ids')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"total_number_of_patients = len(patients)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## W sumie przebadano"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"total_number_of_patients"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"patients.groupby('city').size()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"cf.set_config_file(world_readable=True, offline=True)\n",
	"\n",
	"# Number of patients per city, shown as a pie chart.\n",
	"patients_per_city = patients.groupby('city', as_index=False).size()\n",
	"city_count = pd.DataFrame({'count': patients_per_city}).reset_index()\n",
	"city_count.iplot(\n",
	"    kind='pie',\n",
	"    labels='city',\n",
	"    values='count',\n",
	"    textinfo='value',\n",
	"    sort=True,\n",
	"    colorscale='blues',\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"## Number of blood pressure measurements\n",
	"patients_blood_pressure.shape[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"## Number of glucose measurements taken 2h after a meal\n",
	"patients_glucose_2h_after_meal.shape[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"## Number of glucose measurements taken less than 2h after a meal\n",
	"patients_glucose_less_than_2h_after_meal.shape[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"%matplotlib inline\n",
	"# Diastolic against systolic pressure, with fixed axis ranges.\n",
	"sns.jointplot(\n",
	"    x='diastolic',\n",
	"    y='systolic',\n",
	"    data=patients_blood_pressure,\n",
	"    kind='scatter',\n",
	"    xlim=(30, 140),\n",
	"    ylim=(80, 200),\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false,
	"scrolled": true
	},
	"outputs": [],
	"source": [
	"## Smoker vs. Nonsmoker\n",
	"%matplotlib inline\n",
	"\n",
	"# NOTE(review): newer seaborn releases renamed pairplot's `size` to\n",
	"# `height` -- confirm against the installed version before changing it.\n",
	"sns.pairplot(\n",
	"    patients_blood_pressure,\n",
	"    x_vars=['diastolic'],\n",
	"    y_vars=['systolic'],\n",
	"    hue='smoker',\n",
	"    palette='PuOr',\n",
	"    markers='o',\n",
	"    size=5,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Left-closed age brackets starting at 18: [18, 29), ..., [65, 100).\n",
	"adult_age_edges = [18, 29, 39, 49, 65, 100]\n",
	"patients_blood_pressure['age_bucket'] = pd.cut(\n",
	"    patients_blood_pressure['age'],\n",
	"    bins=adult_age_edges,\n",
	"    right=False,\n",
	")\n",
	"\n",
	"# NOTE(review): newer seaborn releases renamed pairplot's `size` to\n",
	"# `height` -- confirm against the installed version before changing it.\n",
	"sns.pairplot(\n",
	"    patients_blood_pressure,\n",
	"    x_vars=['diastolic'],\n",
	"    y_vars=['systolic'],\n",
	"    hue='age_bucket',\n",
	"    palette='PuOr',\n",
	"    markers='o',\n",
	"    size=5,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Left-closed age brackets: [0, 18), [18, 29), ..., [65, 100).\n",
	"age_bin_edges = [0, 18, 29, 39, 49, 65, 100]\n",
	"patients_glucose_2h_after_meal['age_bucket'] = pd.cut(\n",
	"    patients_glucose_2h_after_meal['age'],\n",
	"    bins=age_bin_edges,\n",
	"    right=False,\n",
	")\n",
	"\n",
	"# One jittered marker per glucose measurement, grouped by age bracket.\n",
	"sns.stripplot(\n",
	"    x='age_bucket',\n",
	"    y='glucose_level',\n",
	"    data=patients_glucose_2h_after_meal,\n",
	"    jitter=0.15,\n",
	")\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Left-closed age brackets: [0, 18), [18, 29), ..., [65, 100).\n",
	"age_bin_edges = [0, 18, 29, 39, 49, 65, 100]\n",
	"patients_glucose_less_than_2h_after_meal['age_bucket'] = pd.cut(\n",
	"    patients_glucose_less_than_2h_after_meal['age'],\n",
	"    bins=age_bin_edges,\n",
	"    right=False,\n",
	")\n",
	"\n",
	"# One jittered marker per glucose measurement, grouped by age bracket.\n",
	"sns.stripplot(\n",
	"    x='age_bucket',\n",
	"    y='glucose_level',\n",
	"    data=patients_glucose_less_than_2h_after_meal,\n",
	"    jitter=0.15,\n",
	")\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"fig, ax = plt.subplots(figsize=(9, 9))\n",
	"\n",
	"# Logarithmic y axis, labelled only at hand-picked glucose values.\n",
	"ax.set_yscale('log')\n",
	"glucose_ticks = [30, 50, 80, 100, 120, 200, 300, 400, 500]\n",
	"ax.semilogy(glucose_ticks, glucose_ticks)\n",
	"ax.set_yticks(glucose_ticks)\n",
	"ax.set_yticklabels(glucose_ticks)\n",
	"\n",
	"# Dashed red reference line at a glucose level of 200.\n",
	"ax.plot([-10, 10], [200, 200], '--', color='red')\n",
	"\n",
	"# NOTE(review): newer seaborn releases renamed `split` to `dodge` for\n",
	"# stripplot -- confirm against the installed version before changing it.\n",
	"sns.stripplot(\n",
	"    x='age_bucket',\n",
	"    y='glucose_level',\n",
	"    hue='smoker',\n",
	"    data=patients_glucose_2h_after_meal,\n",
	"    jitter=0.25,\n",
	"    split=True,\n",
	"    ax=ax,\n",
	")\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"fig, ax = plt.subplots(figsize=(9, 9))\n",
	"\n",
	"# Logarithmic y axis, labelled only at hand-picked glucose values.\n",
	"ax.set_yscale('log')\n",
	"glucose_ticks = [30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
	"ax.semilogy(glucose_ticks, glucose_ticks)\n",
	"ax.set_yticks(glucose_ticks)\n",
	"ax.set_yticklabels(glucose_ticks)\n",
	"\n",
	"# Dashed red reference line at a glucose level of 200.\n",
	"ax.plot([-10, 10], [200, 200], '--', color='red')\n",
	"\n",
	"# NOTE(review): newer seaborn releases renamed `split` to `dodge` for\n",
	"# stripplot -- confirm against the installed version before changing it.\n",
	"sns.stripplot(\n",
	"    x='age_bucket',\n",
	"    y='glucose_level',\n",
	"    hue='smoker',\n",
	"    data=patients_glucose_less_than_2h_after_meal,\n",
	"    jitter=0.25,\n",
	"    split=True,\n",
	"    ax=ax,\n",
	")\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"fig, ax = plt.subplots(figsize=(9, 9))\n",
	"\n",
	"# Logarithmic y axis, labelled only at hand-picked glucose values.\n",
	"ax.set_yscale('log')\n",
	"glucose_ticks = [30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
	"ax.semilogy(glucose_ticks, glucose_ticks)\n",
	"ax.set_yticks(glucose_ticks)\n",
	"ax.set_yticklabels(glucose_ticks)\n",
	"\n",
	"# Dashed red reference line at a glucose level of 200.\n",
	"ax.plot([-10, 10], [200, 200], '--', color='red')\n",
	"\n",
	"# Draw on the axes created above (the current axes at this point).\n",
	"sns.boxplot(\n",
	"    x='age_bucket',\n",
	"    y='glucose_level',\n",
	"    hue='city',\n",
	"    data=patients_glucose_2h_after_meal,\n",
	"    fliersize=10,\n",
	"    whis=0.9,\n",
	"    ax=ax,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"## Ideas:\n",
	"# Wrocław vs. Szczecin"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"## BMI\n",
	"fig, ax = plt.subplots(figsize=(9, 9))\n",
	"\n",
	"# Logarithmic y axis, labelled only at hand-picked glucose values.\n",
	"ax.set_yscale('log')\n",
	"glucose_ticks = [30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
	"ax.semilogy(glucose_ticks, glucose_ticks)\n",
	"ax.set_yticks(glucose_ticks)\n",
	"ax.set_yticklabels(glucose_ticks)\n",
	"\n",
	"# Dashed red reference line at a glucose level of 200.\n",
	"ax.plot([-10, 10], [200, 200], '--', color='red')\n",
	"\n",
	"# BMI classes shown on the x axis, in display order.\n",
	"bmi_classes = ['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
	"\n",
	"# NOTE(review): newer seaborn releases renamed `split` to `dodge` for\n",
	"# stripplot -- confirm against the installed version before changing it.\n",
	"sns.stripplot(\n",
	"    x='bmi_bucket',\n",
	"    y='glucose_level',\n",
	"    hue='smoker',\n",
	"    data=patients_glucose_less_than_2h_after_meal,\n",
	"    order=bmi_classes,\n",
	"    jitter=0.25,\n",
	"    split=True,\n",
	"    ax=ax,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Six xkcd colour names, one per entry of the hue order below.\n",
	"colors = ['blue', 'green', 'yellow', 'orange', 'red', 'dark red']\n",
	"bmi_classes = ['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
	"\n",
	"# NOTE(review): newer seaborn releases renamed pairplot's `size` to\n",
	"# `height` -- confirm against the installed version before changing it.\n",
	"sns.pairplot(\n",
	"    patients_blood_pressure,\n",
	"    x_vars=['diastolic'],\n",
	"    y_vars=['systolic'],\n",
	"    hue='bmi_bucket',\n",
	"    hue_order=bmi_classes,\n",
	"    palette=sns.xkcd_palette(colors),\n",
	"    markers='o',\n",
	"    size=8,\n",
	")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.0"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}