Skip to content

Instantly share code, notes, and snippets.

@adek05
Last active May 29, 2017 11:03
Show Gist options
  • Save adek05/1dbf5f35f8396059388eeda777378285 to your computer and use it in GitHub Desktop.
Save adek05/1dbf5f35f8396059388eeda777378285 to your computer and use it in GitHub Desktop.
{
"cells": [
{
"cell_type": "code",
"execution_count": 509,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import plotly\n",
"import cufflinks as cf\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 510,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import io\n",
"import requests\n",
"\n",
"EXAMS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/exams.csv'\n",
"PATIENTS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/patients.csv'\n",
"\n",
"patients_stream = requests.get(PATIENTS_URL).content\n",
"patients = pd.read_csv(io.StringIO(patients_stream.decode('utf-8')))\n",
"# patients = pd.read_csv('patients.csv', delimiter=',')\n",
"\n",
"exams_stream = requests.get(EXAMS_URL).content\n",
"patients = pd.read_csv(io.StringIO(exams_stream.decode('utf-8')))\n",
"# exams = pd.read_csv('exams.csv', delimiter=',')\n"
]
},
{
"cell_type": "code",
"execution_count": 511,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"cities = ['Szczecin', 'Wrocław', 'Lublin', 'Katowice', 'Gdańsk', 'Olsztyn', 'Kielce', 'Rzeszów']\n",
"\n",
"def extract_city_from_regional_id(regional_id):\n",
" global cities\n",
" matched_cities = [city for city in cities if city in regional_id]\n",
" if len(matched_cities) == 0:\n",
" return 'Unknown'\n",
" return matched_cities[0]"
]
},
{
"cell_type": "code",
"execution_count": 512,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import re\n",
"\n",
"def extract_id_from_regional_id(regional_id):\n",
" match = re.match('.*?(\\d+$)', regional_id)\n",
" if not match:\n",
" return '0'\n",
" return match.group(1)"
]
},
{
"cell_type": "code",
"execution_count": 513,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"# Unify Regional id\n",
"patients['unique_ids'] = patients['regional_id'].apply(\n",
" lambda id: extract_city_from_regional_id(id) + extract_id_from_regional_id(id))\n",
"exams['unique_ids'] = exams['regional_id'].apply(\n",
" lambda id: extract_city_from_regional_id(id) + extract_id_from_regional_id(id))"
]
},
{
"cell_type": "code",
"execution_count": 514,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Extract city for each patient\n",
"patients['city'] = patients['regional_id'].apply(lambda x: extract_city_from_regional_id(x))"
]
},
{
"cell_type": "code",
"execution_count": 515,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "AttributeError",
"evalue": "(\"'Series' object has no attribute 'height'\", 'occurred at index 0')",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, args, **kwds)\u001b[0m\n\u001b[1;32m 4150\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4151\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4152\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4153\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4154\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_broadcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_apply_standard\u001b[0;34m(self, func, axis, ignore_failures, reduce)\u001b[0m\n\u001b[1;32m 4246\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4247\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4248\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4249\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4250\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 2742\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2743\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2744\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2745\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2746\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAttributeError\u001b[0m: (\"'Series' object has no attribute 'height'\", 'occurred at index 0')"
]
}
],
"source": [
"# Calculate BMI for each patient\n",
"patients['bmi'] = patients.apply(lambda row: row.weight / (row.height/100)**2 if row.height != 0 else 0, axis=1)\n",
"\n",
"bmi_buckets = ['wychudzenie', 'niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
"\n",
"patients['bmi_bucket'] = pd.cut(patients['bmi'], bins=[0, 17, 18.5, 25, 30, 35, 40, 100], right=True, labels=bmi_buckets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Filter incomplete data rows\n",
"patients = patients[~patients.unique_ids.str.contains('Unknown')]\n",
"exams = exams[~exams.unique_ids.str.contains('Unknown')]\n",
"exams = exams[~exams.regional_id.str.contains('test')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Select columns which matter\n",
"patients = pd.DataFrame(patients, columns=['unique_ids', 'city', 'age', 'sex', 'smoker', 'bmi', 'bmi_bucket'])\n",
"exams = pd.DataFrame(exams, columns=['exam_name', 'result', 'unique_ids'])\n",
"\n",
"# Select unique patients\n",
"patients = patients.drop_duplicates(subset=['unique_ids'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Prepare Glucose Results data\n",
"glucose = pd.DataFrame(exams[exams.exam_name.str.contains('GlucoseLevel')])\n",
"glucose['glucose_level'] = glucose['result'].apply(lambda x: int(x))\n",
"glucose = pd.DataFrame(glucose, columns=['unique_ids', 'glucose_level', 'exam_name'])\n",
"\n",
"glucose_2h_after_meal = pd.DataFrame(glucose[glucose.exam_name == 'GlucoseLevelYes'])\n",
"glucose_less_than_2h_after_meal = pd.DataFrame(glucose[glucose.exam_name == 'GlucoseLevelNo'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Prepare Blood Pressure Results data\n",
"def extract_systolic(data):\n",
" return int(re.match('(\\d+)/\\d+', data).group(1))\n",
"def extract_diastolic(data):\n",
" return int(re.match('\\d+/(\\d+)', data).group(1))\n",
"\n",
"blood_pressure = pd.DataFrame(exams[exams.exam_name == 'BloodPressureTest'])\n",
"blood_pressure['systolic'] = blood_pressure['result'].apply(extract_systolic)\n",
"blood_pressure['diastolic'] = blood_pressure['result'].apply(extract_diastolic)\n",
"blood_pressure = pd.DataFrame(blood_pressure, columns=['unique_ids', 'systolic', 'diastolic'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Join datasets\n",
"patients_blood_pressure = pd.merge(blood_pressure, patients, on='unique_ids')\n",
"patients_glucose_2h_after_meal = pd.merge(glucose_2h_after_meal, patients, on='unique_ids')\n",
"patients_glucose_less_than_2h_after_meal = pd.merge(glucose_less_than_2h_after_meal, patients, on='unique_ids')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"total_number_of_patients = len(patients)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## W sumie przebadano"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"total_number_of_patients"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"patients.groupby('city').size()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"cf.set_config_file(world_readable=True,offline=True)\n",
"city_count = pd.DataFrame({'count': patients.groupby('city', as_index=False).size()}).reset_index()\n",
"city_count\n",
"city_count.iplot(kind='pie', labels='city', values='count', textinfo='value', sort=True, colorscale='blues')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## Ciśnienie zmierzono\n",
"len(patients_blood_pressure)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## Cukier 2h po jedzeniu zmierzono\n",
"len(patients_glucose_2h_after_meal)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"## Cukier mniej niż 2h po jedzeniu zmierzono\n",
"len(patients_glucose_less_than_2h_after_meal)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"sns.jointplot(\"diastolic\", \"systolic\", data=patients_blood_pressure, kind=\"scatter\", ylim=(80, 200), xlim=(30,140))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"## Smoker vs. Nonsmoker\n",
"%matplotlib inline\n",
"\n",
"sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'], hue='smoker', size=5, markers='o', palette=\"PuOr\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"patients_blood_pressure['age_bucket'] = pd.cut(patients_blood_pressure['age'], bins=[18, 29, 39, 49, 65, 100], right=False)\n",
"sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'], hue='age_bucket', size=5, markers='o', palette=\"PuOr\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"patients_glucose_2h_after_meal['age_bucket'] = \\\n",
" pd.cut(patients_glucose_2h_after_meal['age'], bins=[0, 18, 29, 39, 49, 65, 100], right=False)\n",
"\n",
"# sugar_2h_after_rows.sort_values(by='glucose_yes', ascending=False)\n",
"sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, jitter=0.15)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"patients_glucose_less_than_2h_after_meal['age_bucket'] = \\\n",
" pd.cut(patients_glucose_less_than_2h_after_meal['age'], bins=[0, 18, 29, 39, 49, 65, 100], right=False)\n",
"\n",
"sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_less_than_2h_after_meal, jitter=0.15)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log scale and manually picked labels\n",
"ax.set(yscale='log')\n",
"\n",
"y=[30, 50, 80, 100, 120, 200, 300, 400, 500]\n",
"plt.semilogy(y, y)\n",
"plt.yticks(y, y)\n",
"\n",
"# Horizontal line for 200\n",
"plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
"\n",
"sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, jitter=0.25, hue='smoker', split=True, ax = ax)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log scale and manually picked labels\n",
"ax.set(yscale='log')\n",
"\n",
"y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
"plt.semilogy(y, y)\n",
"plt.yticks(y, y)\n",
"\n",
"# Horizontal line for 200\n",
"plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
"\n",
"sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_less_than_2h_after_meal, jitter=0.25, hue='smoker', split=True, ax=ax)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log scale and hand pick labels\n",
"ax.set(yscale='log')\n",
"\n",
"y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
"plt.semilogy(y, y)\n",
"plt.yticks(y, y)\n",
"\n",
"# Horizontal line for 200\n",
"plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
"\n",
"sns.boxplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, hue='city', fliersize=10, whis=0.9)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Ideas:\n",
"# Wrocław vs. Szczecin"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## BMI\n",
"f, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log scale and manually picked labels\n",
"ax.set(yscale='log')\n",
"\n",
"y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
"plt.semilogy(y, y)\n",
"plt.yticks(y, y)\n",
"\n",
"# Horizontal line for 200\n",
"plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
"\n",
"sns.stripplot(x=\"bmi_bucket\", y=\"glucose_level\", \n",
" data=patients_glucose_less_than_2h_after_meal, jitter=0.25, hue='smoker',\n",
" split=True, ax=ax,\n",
" order=['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"colors = ['blue', 'green', 'yellow', 'orange', 'red', 'dark red']\n",
"# colors = ['blue', 'green', 'red', 'red', 'red', 'red']\n",
"\n",
"sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'],\n",
" hue='bmi_bucket',\n",
" size=8, markers='o', palette=sns.xkcd_palette(colors),\n",
" hue_order=['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 509,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import plotly\n",
"import cufflinks as cf\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 510,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import io\n",
"import requests\n",
"\n",
"EXAMS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/exams.csv'\n",
"PATIENTS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/patients.csv'\n",
"\n",
"patients_stream = requests.get(PATIENTS_URL).content\n",
"patients = pd.read_csv(io.StringIO(patients_stream.decode('utf-8')))\n",
"# patients = pd.read_csv('patients.csv', delimiter=',')\n",
"\n",
"exams_stream = requests.get(EXAMS_URL).content\n",
"exams = pd.read_csv(io.StringIO(exams_stream.decode('utf-8')))\n",
"# exams = pd.read_csv('exams.csv', delimiter=',')\n"
]
},
{
"cell_type": "code",
"execution_count": 511,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"cities = ['Szczecin', 'Wrocław', 'Lublin', 'Katowice', 'Gdańsk', 'Olsztyn', 'Kielce', 'Rzeszów']\n",
"\n",
"def extract_city_from_regional_id(regional_id):\n",
" global cities\n",
" matched_cities = [city for city in cities if city in regional_id]\n",
" if len(matched_cities) == 0:\n",
" return 'Unknown'\n",
" return matched_cities[0]"
]
},
{
"cell_type": "code",
"execution_count": 512,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import re\n",
"\n",
"def extract_id_from_regional_id(regional_id):\n",
" match = re.match('.*?(\\d+$)', regional_id)\n",
" if not match:\n",
" return '0'\n",
" return match.group(1)"
]
},
{
"cell_type": "code",
"execution_count": 513,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"# Unify Regional id\n",
"patients['unique_ids'] = patients['regional_id'].apply(\n",
" lambda id: extract_city_from_regional_id(id) + extract_id_from_regional_id(id))\n",
"exams['unique_ids'] = exams['regional_id'].apply(\n",
" lambda id: extract_city_from_regional_id(id) + extract_id_from_regional_id(id))"
]
},
{
"cell_type": "code",
"execution_count": 514,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Extract city for each patient\n",
"patients['city'] = patients['regional_id'].apply(lambda x: extract_city_from_regional_id(x))"
]
},
{
"cell_type": "code",
"execution_count": 515,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "AttributeError",
"evalue": "(\"'Series' object has no attribute 'height'\", 'occurred at index 0')",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, func, axis, broadcast, raw, reduce, args, **kwds)\u001b[0m\n\u001b[1;32m 4150\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4151\u001b[0m \u001b[0mreduce\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4152\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_standard\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreduce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreduce\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4153\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4154\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_apply_broadcast\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_apply_standard\u001b[0;34m(self, func, axis, ignore_failures, reduce)\u001b[0m\n\u001b[1;32m 4246\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4247\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mi\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv\u001b[0m \u001b[0;32min\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mseries_gen\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 4248\u001b[0;31m \u001b[0mresults\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mi\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4249\u001b[0m \u001b[0mkeys\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mappend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4250\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mException\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m<ipython-input-515-c06a44a88da5>\u001b[0m in \u001b[0;36m<lambda>\u001b[0;34m(row)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# Calculate BMI for each patient\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mpatients\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'bmi'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpatients\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mweight\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0;36m2\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mrow\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mheight\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0;36m0\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mbmi_buckets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m'wychudzenie'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'niedowaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'prawidłowe'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'nadwaga'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość I'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość II'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'otyłość III'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/Users/adek/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m__getattr__\u001b[0;34m(self, name)\u001b[0m\n\u001b[1;32m 2742\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_info_axis\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2743\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2744\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mobject\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getattribute__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2745\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2746\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__setattr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mAttributeError\u001b[0m: (\"'Series' object has no attribute 'height'\", 'occurred at index 0')"
]
}
],
"source": [
"# Calculate BMI for each patient\n",
"patients['bmi'] = patients.apply(lambda row: row.weight / (row.height/100)**2 if row.height != 0 else 0, axis=1)\n",
"\n",
"bmi_buckets = ['wychudzenie', 'niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
"\n",
"patients['bmi_bucket'] = pd.cut(patients['bmi'], bins=[0, 17, 18.5, 25, 30, 35, 40, 100], right=True, labels=bmi_buckets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Filter incomplete data rows\n",
"patients = patients[~patients.unique_ids.str.contains('Unknown')]\n",
"exams = exams[~exams.unique_ids.str.contains('Unknown')]\n",
"exams = exams[~exams.regional_id.str.contains('test')]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Select columns which matter\n",
"patients = pd.DataFrame(patients, columns=['unique_ids', 'city', 'age', 'sex', 'smoker', 'bmi', 'bmi_bucket'])\n",
"exams = pd.DataFrame(exams, columns=['exam_name', 'result', 'unique_ids'])\n",
"\n",
"# Select unique patients\n",
"patients = patients.drop_duplicates(subset=['unique_ids'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Prepare Glucose Results data\n",
"glucose = pd.DataFrame(exams[exams.exam_name.str.contains('GlucoseLevel')])\n",
"glucose['glucose_level'] = glucose['result'].apply(lambda x: int(x))\n",
"glucose = pd.DataFrame(glucose, columns=['unique_ids', 'glucose_level', 'exam_name'])\n",
"\n",
"glucose_2h_after_meal = pd.DataFrame(glucose[glucose.exam_name == 'GlucoseLevelYes'])\n",
"glucose_less_than_2h_after_meal = pd.DataFrame(glucose[glucose.exam_name == 'GlucoseLevelNo'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Prepare Blood Pressure Results data\n",
"def _blood_pressure_part(data, index):\n",
"    \"\"\"Return group `index` (1 or 2) of a '<digits>/<digits>' reading as an int.\n",
"\n",
"    Raises ValueError naming the offending value when `data` does not start\n",
"    with that pattern.\n",
"    \"\"\"\n",
"    # Raw string: '\\d' in a normal literal is an invalid escape sequence.\n",
"    match = re.match(r'(\\d+)/(\\d+)', data)\n",
"    if match is None:\n",
"        raise ValueError('Malformed blood pressure result: {!r}'.format(data))\n",
"    return int(match.group(index))\n",
"\n",
"\n",
"def extract_systolic(data):\n",
"    \"\"\"Return the number before the slash of a reading such as '120/80'.\"\"\"\n",
"    return _blood_pressure_part(data, 1)\n",
"\n",
"\n",
"def extract_diastolic(data):\n",
"    \"\"\"Return the number after the slash of a reading such as '120/80'.\"\"\"\n",
"    return _blood_pressure_part(data, 2)\n",
"\n",
"\n",
"# Explicit copy so the columns added below go onto an independent frame.\n",
"blood_pressure = exams[exams.exam_name == 'BloodPressureTest'].copy()\n",
"blood_pressure['systolic'] = blood_pressure['result'].apply(extract_systolic)\n",
"blood_pressure['diastolic'] = blood_pressure['result'].apply(extract_diastolic)\n",
"blood_pressure = pd.DataFrame(blood_pressure, columns=['unique_ids', 'systolic', 'diastolic'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Join datasets\n",
"patients_blood_pressure = pd.merge(blood_pressure, patients, on='unique_ids')\n",
"patients_glucose_2h_after_meal = pd.merge(glucose_2h_after_meal, patients, on='unique_ids')\n",
"patients_glucose_less_than_2h_after_meal = pd.merge(glucose_less_than_2h_after_meal, patients, on='unique_ids')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"total_number_of_patients = len(patients)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## W sumie przebadano"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"total_number_of_patients"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"patients.groupby('city').size()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"cf.set_config_file(world_readable=True, offline=True)\n",
"\n",
"# Patients per city as a two-column frame: 'city' and 'count'.\n",
"# Series.reset_index(name=...) builds it directly, independent of what\n",
"# groupby(..., as_index=False).size() returns in the installed pandas.\n",
"city_count = patients.groupby('city').size().reset_index(name='count')\n",
"city_count.iplot(kind='pie', labels='city', values='count', textinfo='value', sort=True, colorscale='blues')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## Ciśnienie zmierzono\n",
"len(patients_blood_pressure)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## Cukier 2h po jedzeniu zmierzono\n",
"len(patients_glucose_2h_after_meal)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"## Cukier mniej niż 2h po jedzeniu zmierzono\n",
"len(patients_glucose_less_than_2h_after_meal)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"sns.jointplot(\"diastolic\", \"systolic\", data=patients_blood_pressure, kind=\"scatter\", ylim=(80, 200), xlim=(30,140))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"## Smoker vs. Nonsmoker\n",
"%matplotlib inline\n",
"\n",
"sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'], hue='smoker', size=5, markers='o', palette=\"PuOr\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"patients_blood_pressure['age_bucket'] = pd.cut(patients_blood_pressure['age'], bins=[18, 29, 39, 49, 65, 100], right=False)\n",
"sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'], hue='age_bucket', size=5, markers='o', palette=\"PuOr\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"patients_glucose_2h_after_meal['age_bucket'] = \\\n",
" pd.cut(patients_glucose_2h_after_meal['age'], bins=[0, 18, 29, 39, 49, 65, 100], right=False)\n",
"\n",
"# sugar_2h_after_rows.sort_values(by='glucose_yes', ascending=False)\n",
"sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, jitter=0.15)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"patients_glucose_less_than_2h_after_meal['age_bucket'] = \\\n",
" pd.cut(patients_glucose_less_than_2h_after_meal['age'], bins=[0, 18, 29, 39, 49, 65, 100], right=False)\n",
"\n",
"sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_less_than_2h_after_meal, jitter=0.15)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log scale and manually picked labels\n",
"ax.set(yscale='log')\n",
"\n",
"y=[30, 50, 80, 100, 120, 200, 300, 400, 500]\n",
"plt.semilogy(y, y)\n",
"plt.yticks(y, y)\n",
"\n",
"# Horizontal line for 200\n",
"plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
"\n",
"sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, jitter=0.25, hue='smoker', split=True, ax = ax)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log scale and manually picked labels\n",
"ax.set(yscale='log')\n",
"\n",
"y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
"plt.semilogy(y, y)\n",
"plt.yticks(y, y)\n",
"\n",
"# Horizontal line for 200\n",
"plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
"\n",
"sns.stripplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_less_than_2h_after_meal, jitter=0.25, hue='smoker', split=True, ax=ax)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"f, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log scale and hand pick labels\n",
"ax.set(yscale='log')\n",
"\n",
"y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
"plt.semilogy(y, y)\n",
"plt.yticks(y, y)\n",
"\n",
"# Horizontal line for 200\n",
"plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
"\n",
"sns.boxplot(x=\"age_bucket\", y=\"glucose_level\", data=patients_glucose_2h_after_meal, hue='city', fliersize=10, whis=0.9)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Ideas:\n",
"# Wrocław vs. Szczecin"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## BMI\n",
"f, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log scale and manually picked labels\n",
"ax.set(yscale='log')\n",
"\n",
"y=[30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
"plt.semilogy(y, y)\n",
"plt.yticks(y, y)\n",
"\n",
"# Horizontal line for 200\n",
"plt.plot([-10, 10], [200, 200], '--', color=\"red\")\n",
"\n",
"sns.stripplot(x=\"bmi_bucket\", y=\"glucose_level\", \n",
" data=patients_glucose_less_than_2h_after_meal, jitter=0.25, hue='smoker',\n",
" split=True, ax=ax,\n",
" order=['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"colors = ['blue', 'green', 'yellow', 'orange', 'red', 'dark red']\n",
"# colors = ['blue', 'green', 'red', 'red', 'red', 'red']\n",
"\n",
"sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'],\n",
" hue='bmi_bucket',\n",
" size=8, markers='o', palette=sns.xkcd_palette(colors),\n",
" hue_order=['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 509,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import plotly\n",
"import cufflinks as cf\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 510,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import io\n",
"import requests\n",
"\n",
"EXAMS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/exams.csv'\n",
"PATIENTS_URL = 'https://gist.githubusercontent.com/adek05/c2218eaf0d3e04717667a3a2dcc256a0/raw/0395dbb19354d6997c9f6d592e2e9e6138caec08/patients.csv'\n",
"\n",
"\n",
"def fetch_csv(url, timeout=30):\n",
"    \"\"\"Download the CSV at `url` and parse it into a DataFrame.\n",
"\n",
"    Raises requests.HTTPError on a non-2xx response, so an HTTP error page is\n",
"    never parsed as if it were data. `timeout` (seconds) is passed to requests.get.\n",
"    \"\"\"\n",
"    response = requests.get(url, timeout=timeout)\n",
"    response.raise_for_status()\n",
"    return pd.read_csv(io.StringIO(response.content.decode('utf-8')))\n",
"\n",
"\n",
"# Offline alternative: pd.read_csv('patients.csv') / pd.read_csv('exams.csv')\n",
"patients = fetch_csv(PATIENTS_URL)\n",
"exams = fetch_csv(EXAMS_URL)\n"
]
},
{
"cell_type": "code",
"execution_count": 511,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"cities = ['Szczecin', 'Wrocław', 'Lublin', 'Katowice', 'Gdańsk', 'Olsztyn', 'Kielce', 'Rzeszów']\n",
"\n",
"\n",
"def extract_city_from_regional_id(regional_id):\n",
"    \"\"\"Return the first entry of `cities` found as a substring of `regional_id`.\n",
"\n",
"    `cities` is searched in list order; 'Unknown' is returned when none matches.\n",
"    \"\"\"\n",
"    # `cities` is only read, never rebound, so no `global` statement is needed.\n",
"    return next((city for city in cities if city in regional_id), 'Unknown')"
]
},
{
"cell_type": "code",
"execution_count": 512,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import re\n",
"\n",
"\n",
"def extract_id_from_regional_id(regional_id):\n",
"    \"\"\"Return the digits that end `regional_id` as a string, or '0' when there are none.\"\"\"\n",
"    # Raw string: '\\d' in a normal literal is an invalid escape sequence.\n",
"    match = re.match(r'.*?(\\d+$)', regional_id)\n",
"    if not match:\n",
"        return '0'\n",
"    return match.group(1)"
]
},
{
"cell_type": "code",
"execution_count": 513,
"metadata": {
"collapsed": false,
"scrolled": false
},
"outputs": [],
"source": [
"# Unify regional ids: detected city name followed by the trailing number\n",
"def _to_unique_id(regional_id):\n",
"    return extract_city_from_regional_id(regional_id) + extract_id_from_regional_id(regional_id)\n",
"\n",
"\n",
"patients['unique_ids'] = patients['regional_id'].apply(_to_unique_id)\n",
"exams['unique_ids'] = exams['regional_id'].apply(_to_unique_id)"
]
},
{
"cell_type": "code",
"execution_count": 514,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Derive the city column from each patient's regional id\n",
"patients['city'] = patients['regional_id'].apply(extract_city_from_regional_id)"
]
},
{
"cell_type": "code",
"execution_count": 515,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Calculate BMI for each patient as weight / (height / 100) ** 2.\n",
"# Computed on whole columns instead of calling a lambda once per row.\n",
"# Rows with height == 0 keep the sentinel BMI of 0 instead of the result of a division by zero.\n",
"patients['bmi'] = (\n",
"    patients['weight'] / (patients['height'] / 100) ** 2\n",
").where(patients['height'] != 0, 0)\n",
"\n",
"bmi_buckets = ['wychudzenie', 'niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
"\n",
"# Bins are right-closed, so the sentinel 0 falls outside every bucket and gets NaN.\n",
"patients['bmi_bucket'] = pd.cut(patients['bmi'], bins=[0, 17, 18.5, 25, 30, 35, 40, 100], right=True, labels=bmi_buckets)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Drop rows whose unique id contains 'Unknown', and exams whose regional id contains 'test'\n",
"known_patient = ~patients['unique_ids'].str.contains('Unknown')\n",
"patients = patients[known_patient]\n",
"\n",
"usable_exam = ~exams['unique_ids'].str.contains('Unknown') & ~exams['regional_id'].str.contains('test')\n",
"exams = exams[usable_exam]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Keep only the columns used further down\n",
"patient_columns = ['unique_ids', 'city', 'age', 'sex', 'smoker', 'bmi', 'bmi_bucket']\n",
"exam_columns = ['exam_name', 'result', 'unique_ids']\n",
"patients = patients.reindex(columns=patient_columns)\n",
"exams = exams.reindex(columns=exam_columns)\n",
"\n",
"# One row per patient: the first occurrence of each unique id is kept\n",
"patients = patients.drop_duplicates(subset=['unique_ids'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Prepare glucose results: exams whose name contains 'GlucoseLevel', result parsed as int\n",
"is_glucose_exam = exams['exam_name'].str.contains('GlucoseLevel')\n",
"glucose = pd.DataFrame(exams[is_glucose_exam])\n",
"glucose['glucose_level'] = glucose['result'].apply(int)\n",
"glucose = pd.DataFrame(glucose, columns=['unique_ids', 'glucose_level', 'exam_name'])\n",
"\n",
"# Split by exam name into the two measurement groups\n",
"glucose_2h_after_meal = pd.DataFrame(glucose[glucose['exam_name'] == 'GlucoseLevelYes'])\n",
"glucose_less_than_2h_after_meal = pd.DataFrame(glucose[glucose['exam_name'] == 'GlucoseLevelNo'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Prepare Blood Pressure Results data\n",
"def _blood_pressure_part(data, index):\n",
"    \"\"\"Return group `index` (1 or 2) of a '<digits>/<digits>' reading as an int.\n",
"\n",
"    Raises ValueError naming the offending value when `data` does not start\n",
"    with that pattern.\n",
"    \"\"\"\n",
"    # Raw string: '\\d' in a normal literal is an invalid escape sequence.\n",
"    match = re.match(r'(\\d+)/(\\d+)', data)\n",
"    if match is None:\n",
"        raise ValueError('Malformed blood pressure result: {!r}'.format(data))\n",
"    return int(match.group(index))\n",
"\n",
"\n",
"def extract_systolic(data):\n",
"    \"\"\"Return the number before the slash of a reading such as '120/80'.\"\"\"\n",
"    return _blood_pressure_part(data, 1)\n",
"\n",
"\n",
"def extract_diastolic(data):\n",
"    \"\"\"Return the number after the slash of a reading such as '120/80'.\"\"\"\n",
"    return _blood_pressure_part(data, 2)\n",
"\n",
"\n",
"# Explicit copy so the columns added below go onto an independent frame.\n",
"blood_pressure = exams[exams.exam_name == 'BloodPressureTest'].copy()\n",
"blood_pressure['systolic'] = blood_pressure['result'].apply(extract_systolic)\n",
"blood_pressure['diastolic'] = blood_pressure['result'].apply(extract_diastolic)\n",
"blood_pressure = pd.DataFrame(blood_pressure, columns=['unique_ids', 'systolic', 'diastolic'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Attach patient attributes to each exam row, matching on unique_ids\n",
"patients_blood_pressure = blood_pressure.merge(patients, on='unique_ids')\n",
"patients_glucose_2h_after_meal = glucose_2h_after_meal.merge(patients, on='unique_ids')\n",
"patients_glucose_less_than_2h_after_meal = glucose_less_than_2h_after_meal.merge(patients, on='unique_ids')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"total_number_of_patients = len(patients)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## W sumie przebadano"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"total_number_of_patients"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"patients.groupby('city').size()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"cf.set_config_file(world_readable=True, offline=True)\n",
"\n",
"# Patients per city as a two-column frame: 'city' and 'count'.\n",
"# Series.reset_index(name=...) builds it directly, independent of what\n",
"# groupby(..., as_index=False).size() returns in the installed pandas.\n",
"city_count = patients.groupby('city').size().reset_index(name='count')\n",
"city_count.iplot(kind='pie', labels='city', values='count', textinfo='value', sort=True, colorscale='blues')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## Blood pressure measured: number of rows in the joined blood-pressure frame\n",
"len(patients_blood_pressure)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## Sugar measured 2h after eating: number of rows in the joined frame\n",
"len(patients_glucose_2h_after_meal)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"## Sugar measured less than 2h after eating: number of rows in the joined frame\n",
"len(patients_glucose_less_than_2h_after_meal)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"# Diastolic (x) against systolic (y) with marginal distributions, restricted to the given axis limits\n",
"sns.jointplot(x='diastolic', y='systolic', data=patients_blood_pressure, kind='scatter',\n",
"              xlim=(30, 140), ylim=(80, 200))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"\n",
"# Smoker vs. non-smoker: one diastolic/systolic scatter coloured by the `smoker` column\n",
"smoker_plot_options = dict(x_vars=['diastolic'], y_vars=['systolic'], hue='smoker',\n",
"                           size=5, markers='o', palette='PuOr')\n",
"sns.pairplot(patients_blood_pressure, **smoker_plot_options)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Half-open age intervals: [18, 29), [29, 39), [39, 49), [49, 65), [65, 100)\n",
"age_bins = [18, 29, 39, 49, 65, 100]\n",
"patients_blood_pressure['age_bucket'] = pd.cut(patients_blood_pressure['age'], bins=age_bins, right=False)\n",
"\n",
"sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'],\n",
"             hue='age_bucket', size=5, markers='o', palette='PuOr')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Bucket ages into half-open intervals starting at 0, then plot glucose level per bucket\n",
"patients_glucose_2h_after_meal['age_bucket'] = pd.cut(\n",
"    patients_glucose_2h_after_meal['age'],\n",
"    bins=[0, 18, 29, 39, 49, 65, 100],\n",
"    right=False,\n",
")\n",
"\n",
"sns.stripplot(x='age_bucket', y='glucose_level', data=patients_glucose_2h_after_meal, jitter=0.15)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Bucket ages into half-open intervals starting at 0, then plot glucose level per bucket\n",
"patients_glucose_less_than_2h_after_meal['age_bucket'] = pd.cut(\n",
"    patients_glucose_less_than_2h_after_meal['age'],\n",
"    bins=[0, 18, 29, 39, 49, 65, 100],\n",
"    right=False,\n",
")\n",
"\n",
"sns.stripplot(x='age_bucket', y='glucose_level', data=patients_glucose_less_than_2h_after_meal, jitter=0.15)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log-scaled y axis with hand-picked tick positions, each labelled with its own value\n",
"glucose_ticks = [30, 50, 80, 100, 120, 200, 300, 400, 500]\n",
"ax.set_yscale('log')\n",
"ax.semilogy(glucose_ticks, glucose_ticks)\n",
"ax.set_yticks(glucose_ticks)\n",
"ax.set_yticklabels(glucose_ticks)\n",
"\n",
"# Dashed red horizontal line at 200\n",
"ax.plot([-10, 10], [200, 200], '--', color='red')\n",
"\n",
"sns.stripplot(x='age_bucket', y='glucose_level', data=patients_glucose_2h_after_meal,\n",
"              jitter=0.25, hue='smoker', split=True, ax=ax)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log-scaled y axis with hand-picked tick positions, each labelled with its own value\n",
"glucose_ticks = [30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
"ax.set_yscale('log')\n",
"ax.semilogy(glucose_ticks, glucose_ticks)\n",
"ax.set_yticks(glucose_ticks)\n",
"ax.set_yticklabels(glucose_ticks)\n",
"\n",
"# Dashed red horizontal line at 200\n",
"ax.plot([-10, 10], [200, 200], '--', color='red')\n",
"\n",
"sns.stripplot(x='age_bucket', y='glucose_level', data=patients_glucose_less_than_2h_after_meal,\n",
"              jitter=0.25, hue='smoker', split=True, ax=ax)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"fig, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log-scaled y axis with hand-picked tick positions, each labelled with its own value\n",
"glucose_ticks = [30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
"ax.set_yscale('log')\n",
"ax.semilogy(glucose_ticks, glucose_ticks)\n",
"ax.set_yticks(glucose_ticks)\n",
"ax.set_yticklabels(glucose_ticks)\n",
"\n",
"# Dashed red horizontal line at 200\n",
"ax.plot([-10, 10], [200, 200], '--', color='red')\n",
"\n",
"# Drawn on the axes created above (the current axes), now passed explicitly\n",
"sns.boxplot(x='age_bucket', y='glucose_level', data=patients_glucose_2h_after_meal,\n",
"            hue='city', fliersize=10, whis=0.9, ax=ax)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"## Ideas:\n",
"# Wrocław vs. Szczecin"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"## BMI\n",
"fig, ax = plt.subplots(figsize=(9, 9))\n",
"\n",
"# Log-scaled y axis with hand-picked tick positions, each labelled with its own value\n",
"glucose_ticks = [30, 50, 80, 100, 140, 200, 300, 400, 500]\n",
"ax.set_yscale('log')\n",
"ax.semilogy(glucose_ticks, glucose_ticks)\n",
"ax.set_yticks(glucose_ticks)\n",
"ax.set_yticklabels(glucose_ticks)\n",
"\n",
"# Dashed red horizontal line at 200\n",
"ax.plot([-10, 10], [200, 200], '--', color='red')\n",
"\n",
"# BMI buckets shown on the x axis, in this order\n",
"bucket_order = ['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
"sns.stripplot(x='bmi_bucket', y='glucose_level',\n",
"              data=patients_glucose_less_than_2h_after_meal,\n",
"              order=bucket_order, hue='smoker', split=True,\n",
"              jitter=0.25, ax=ax)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# One xkcd colour name per BMI bucket, in the hue order given below\n",
"bucket_order = ['niedowaga', 'prawidłowe', 'nadwaga', 'otyłość I', 'otyłość II', 'otyłość III']\n",
"colors = ['blue', 'green', 'yellow', 'orange', 'red', 'dark red']\n",
"\n",
"sns.pairplot(patients_blood_pressure, x_vars=['diastolic'], y_vars=['systolic'],\n",
"             hue='bmi_bucket', hue_order=bucket_order,\n",
"             size=8, markers='o', palette=sns.xkcd_palette(colors))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment