Skip to content

Instantly share code, notes, and snippets.

@BaekKyunShin
Created June 25, 2019 14:36
Show Gist options
  • Save BaekKyunShin/9edcbb31bf26f4ab0c40a9b3499ffb31 to your computer and use it in GitHub Desktop.
Save BaekKyunShin/9edcbb31bf26f4ab0c40a9b3499ffb31 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Gather"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"patients = pd.read_csv('patients.csv')\n",
"treatments = pd.read_csv('treatments.csv')\n",
"adverse_reactions = pd.read_csv('adverse_reactions.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Assess"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>assigned_sex</th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>address</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip_code</th>\n",
" <th>country</th>\n",
" <th>contact</th>\n",
" <th>birthdate</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>Zoe</td>\n",
" <td>Wellish</td>\n",
" <td>576 Brown Bear Drive</td>\n",
" <td>Rancho California</td>\n",
" <td>California</td>\n",
" <td>92390.0</td>\n",
" <td>United States</td>\n",
" <td>951-719-9170ZoeWellish@superrito.com</td>\n",
" <td>7/10/1976</td>\n",
" <td>121.7</td>\n",
" <td>66</td>\n",
" <td>19.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>female</td>\n",
" <td>Pamela</td>\n",
" <td>Hill</td>\n",
" <td>2370 University Hill Road</td>\n",
" <td>Armstrong</td>\n",
" <td>Illinois</td>\n",
" <td>61812.0</td>\n",
" <td>United States</td>\n",
" <td>PamelaSHill@cuvox.de+1 (217) 569-3204</td>\n",
" <td>4/3/1967</td>\n",
" <td>118.8</td>\n",
" <td>66</td>\n",
" <td>19.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>Jae</td>\n",
" <td>Debord</td>\n",
" <td>1493 Poling Farm Road</td>\n",
" <td>York</td>\n",
" <td>Nebraska</td>\n",
" <td>68467.0</td>\n",
" <td>United States</td>\n",
" <td>402-363-6804JaeMDebord@gustr.com</td>\n",
" <td>2/19/1980</td>\n",
" <td>177.8</td>\n",
" <td>71</td>\n",
" <td>24.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4</td>\n",
" <td>male</td>\n",
" <td>Liêm</td>\n",
" <td>Phan</td>\n",
" <td>2335 Webster Street</td>\n",
" <td>Woodbridge</td>\n",
" <td>NJ</td>\n",
" <td>7095.0</td>\n",
" <td>United States</td>\n",
" <td>PhanBaLiem@jourrapide.com+1 (732) 636-8246</td>\n",
" <td>7/26/1951</td>\n",
" <td>220.9</td>\n",
" <td>70</td>\n",
" <td>31.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>male</td>\n",
" <td>Tim</td>\n",
" <td>Neudorf</td>\n",
" <td>1428 Turkey Pen Lane</td>\n",
" <td>Dothan</td>\n",
" <td>AL</td>\n",
" <td>36303.0</td>\n",
" <td>United States</td>\n",
" <td>334-515-7487TimNeudorf@cuvox.de</td>\n",
" <td>2/18/1928</td>\n",
" <td>192.3</td>\n",
" <td>27</td>\n",
" <td>26.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>6</td>\n",
" <td>male</td>\n",
" <td>Rafael</td>\n",
" <td>Costa</td>\n",
" <td>1140 Willis Avenue</td>\n",
" <td>Daytona Beach</td>\n",
" <td>Florida</td>\n",
" <td>32114.0</td>\n",
" <td>United States</td>\n",
" <td>386-334-5237RafaelCardosoCosta@gustr.com</td>\n",
" <td>8/31/1931</td>\n",
" <td>183.9</td>\n",
" <td>70</td>\n",
" <td>26.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7</td>\n",
" <td>female</td>\n",
" <td>Mary</td>\n",
" <td>Adams</td>\n",
" <td>3145 Sheila Lane</td>\n",
" <td>Burbank</td>\n",
" <td>NV</td>\n",
" <td>84728.0</td>\n",
" <td>United States</td>\n",
" <td>775-533-5933MaryBAdams@einrot.com</td>\n",
" <td>11/19/1969</td>\n",
" <td>146.3</td>\n",
" <td>65</td>\n",
" <td>24.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>8</td>\n",
" <td>female</td>\n",
" <td>Xiuxiu</td>\n",
" <td>Chang</td>\n",
" <td>2687 Black Oak Hollow Road</td>\n",
" <td>Morgan Hill</td>\n",
" <td>CA</td>\n",
" <td>95037.0</td>\n",
" <td>United States</td>\n",
" <td>XiuxiuChang@einrot.com1 408 778 3236</td>\n",
" <td>8/13/1958</td>\n",
" <td>158.0</td>\n",
" <td>60</td>\n",
" <td>30.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>male</td>\n",
" <td>Dsvid</td>\n",
" <td>Gustafsson</td>\n",
" <td>1790 Nutter Street</td>\n",
" <td>Kansas City</td>\n",
" <td>MO</td>\n",
" <td>64105.0</td>\n",
" <td>United States</td>\n",
" <td>816-265-9578DavidGustafsson@armyspy.com</td>\n",
" <td>3/6/1937</td>\n",
" <td>163.9</td>\n",
" <td>66</td>\n",
" <td>26.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>10</td>\n",
" <td>female</td>\n",
" <td>Sophie</td>\n",
" <td>Cabrera</td>\n",
" <td>3303 Anmoore Road</td>\n",
" <td>New York</td>\n",
" <td>New York</td>\n",
" <td>10011.0</td>\n",
" <td>United States</td>\n",
" <td>SophieCabreraIbarra@teleworm.us1 718 795 9124</td>\n",
" <td>12/3/1930</td>\n",
" <td>194.7</td>\n",
" <td>64</td>\n",
" <td>33.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>11</td>\n",
" <td>female</td>\n",
" <td>Sandy</td>\n",
" <td>Gunnarsson</td>\n",
" <td>87 Wood Duck Drive</td>\n",
" <td>Rudyard</td>\n",
" <td>MI</td>\n",
" <td>49780.0</td>\n",
" <td>United States</td>\n",
" <td>906-478-8949SandyGunnarsson@dayrep.com</td>\n",
" <td>7/16/1974</td>\n",
" <td>199.3</td>\n",
" <td>62</td>\n",
" <td>36.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>12</td>\n",
" <td>male</td>\n",
" <td>Abdul-Nur</td>\n",
" <td>Isa</td>\n",
" <td>1092 Farm Meadow Drive</td>\n",
" <td>Brentwood</td>\n",
" <td>TN</td>\n",
" <td>37027.0</td>\n",
" <td>United States</td>\n",
" <td>Abdul-NurMummarIsa@rhyta.com1 931 207 0839</td>\n",
" <td>2/3/1954</td>\n",
" <td>238.7</td>\n",
" <td>73</td>\n",
" <td>31.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>13</td>\n",
" <td>male</td>\n",
" <td>Omeokachie</td>\n",
" <td>Ibeamaka</td>\n",
" <td>2544 Worley Avenue</td>\n",
" <td>Lynchburg</td>\n",
" <td>VA</td>\n",
" <td>24504.0</td>\n",
" <td>United States</td>\n",
" <td>OmeokachieIbeamaka@einrot.com434-509-2614</td>\n",
" <td>8/5/1957</td>\n",
" <td>224.2</td>\n",
" <td>69</td>\n",
" <td>33.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>14</td>\n",
" <td>female</td>\n",
" <td>Anenechi</td>\n",
" <td>Chidi</td>\n",
" <td>826 Broad Street</td>\n",
" <td>Birmingham</td>\n",
" <td>AL</td>\n",
" <td>35203.0</td>\n",
" <td>United States</td>\n",
" <td>AnenechiChidi@armyspy.com+1 (205) 417-8095</td>\n",
" <td>3/7/1961</td>\n",
" <td>228.4</td>\n",
" <td>67</td>\n",
" <td>35.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>15</td>\n",
" <td>female</td>\n",
" <td>Asia</td>\n",
" <td>Woźniak</td>\n",
" <td>4970 Heather Sees Way</td>\n",
" <td>Tulsa</td>\n",
" <td>OK</td>\n",
" <td>74105.0</td>\n",
" <td>United States</td>\n",
" <td>AsiaWozniak@rhyta.com918-712-3469</td>\n",
" <td>8/15/1997</td>\n",
" <td>112.0</td>\n",
" <td>65</td>\n",
" <td>18.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>16</td>\n",
" <td>male</td>\n",
" <td>Søren</td>\n",
" <td>Lund</td>\n",
" <td>2438 Shady Pines Drive</td>\n",
" <td>Kingsport</td>\n",
" <td>VA</td>\n",
" <td>37660.0</td>\n",
" <td>United States</td>\n",
" <td>276-225-1955SrenFLund@gustr.com</td>\n",
" <td>8/23/1922</td>\n",
" <td>201.5</td>\n",
" <td>64</td>\n",
" <td>34.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>17</td>\n",
" <td>female</td>\n",
" <td>Tám</td>\n",
" <td>Liễu</td>\n",
" <td>2152 Heritage Road</td>\n",
" <td>Fresno</td>\n",
" <td>California</td>\n",
" <td>93706.0</td>\n",
" <td>United States</td>\n",
" <td>LieuThiThuTam@dayrep.com1 559 765 7836</td>\n",
" <td>11/14/1952</td>\n",
" <td>183.9</td>\n",
" <td>61</td>\n",
" <td>34.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>18</td>\n",
" <td>female</td>\n",
" <td>Roxanne</td>\n",
" <td>Andreyeva</td>\n",
" <td>2103 Edington Drive</td>\n",
" <td>Smyrna</td>\n",
" <td>GA</td>\n",
" <td>30082.0</td>\n",
" <td>United States</td>\n",
" <td>RoxanneAndreyeva@armyspy.com678-829-8578</td>\n",
" <td>7/24/1922</td>\n",
" <td>129.1</td>\n",
" <td>60</td>\n",
" <td>25.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>19</td>\n",
" <td>male</td>\n",
" <td>William</td>\n",
" <td>Oates</td>\n",
" <td>441 Tibbs Avenue</td>\n",
" <td>Ekalaka</td>\n",
" <td>MT</td>\n",
" <td>59324.0</td>\n",
" <td>United States</td>\n",
" <td>406-775-2696WilliamVOates@armyspy.com</td>\n",
" <td>9/4/1949</td>\n",
" <td>202.2</td>\n",
" <td>64</td>\n",
" <td>34.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>20</td>\n",
" <td>male</td>\n",
" <td>Zak</td>\n",
" <td>Kelly</td>\n",
" <td>994 Hill Croft Farm Road</td>\n",
" <td>Oroville</td>\n",
" <td>California</td>\n",
" <td>95966.0</td>\n",
" <td>United States</td>\n",
" <td>ZakKelly@rhyta.com1 530 532 8397</td>\n",
" <td>12/13/1988</td>\n",
" <td>208.8</td>\n",
" <td>70</td>\n",
" <td>30.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>21</td>\n",
" <td>female</td>\n",
" <td>Sofia</td>\n",
" <td>Karlsen</td>\n",
" <td>2931 Romano Street</td>\n",
" <td>Whitman</td>\n",
" <td>MA</td>\n",
" <td>2382.0</td>\n",
" <td>United States</td>\n",
" <td>SofiaTKarlsen@teleworm.us1 781 447 1763</td>\n",
" <td>9/24/1934</td>\n",
" <td>153.1</td>\n",
" <td>66</td>\n",
" <td>24.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>22</td>\n",
" <td>male</td>\n",
" <td>Samúel</td>\n",
" <td>Guðbrandsson</td>\n",
" <td>1904 Granville Lane</td>\n",
" <td>Elmsford</td>\n",
" <td>NJ</td>\n",
" <td>10523.0</td>\n",
" <td>United States</td>\n",
" <td>973-445-5341SamuelGubrandsson@teleworm.us</td>\n",
" <td>4/12/1983</td>\n",
" <td>223.7</td>\n",
" <td>69</td>\n",
" <td>33.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>23</td>\n",
" <td>male</td>\n",
" <td>Manchu</td>\n",
" <td>Su</td>\n",
" <td>1092 Deans Lane</td>\n",
" <td>Pleasantville</td>\n",
" <td>NY</td>\n",
" <td>10570.0</td>\n",
" <td>United States</td>\n",
" <td>914-745-6108ManchuSu@einrot.com</td>\n",
" <td>1/19/1936</td>\n",
" <td>130.7</td>\n",
" <td>65</td>\n",
" <td>21.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>24</td>\n",
" <td>male</td>\n",
" <td>Lovre</td>\n",
" <td>Galić</td>\n",
" <td>4941 Marion Drive</td>\n",
" <td>Winter Haven</td>\n",
" <td>Florida</td>\n",
" <td>33830.0</td>\n",
" <td>United States</td>\n",
" <td>LovreGalic@gustr.com1 813 355 9476</td>\n",
" <td>5/26/1960</td>\n",
" <td>222.9</td>\n",
" <td>66</td>\n",
" <td>36.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>25</td>\n",
" <td>male</td>\n",
" <td>Jakob</td>\n",
" <td>Jakobsen</td>\n",
" <td>648 Old Dear Lane</td>\n",
" <td>Port Jervis</td>\n",
" <td>New York</td>\n",
" <td>12771.0</td>\n",
" <td>United States</td>\n",
" <td>JakobCJakobsen@einrot.com+1 (845) 858-7707</td>\n",
" <td>8/1/1985</td>\n",
" <td>155.8</td>\n",
" <td>67</td>\n",
" <td>24.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>26</td>\n",
" <td>male</td>\n",
" <td>Gregor</td>\n",
" <td>Bole</td>\n",
" <td>922 Chapmans Lane</td>\n",
" <td>Albuquerque</td>\n",
" <td>NM</td>\n",
" <td>87109.0</td>\n",
" <td>United States</td>\n",
" <td>GregorBole@gustr.com505-828-4955</td>\n",
" <td>6/19/1922</td>\n",
" <td>180.8</td>\n",
" <td>67</td>\n",
" <td>28.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>27</td>\n",
" <td>female</td>\n",
" <td>Ella</td>\n",
" <td>Lund</td>\n",
" <td>1207 Garfield Road</td>\n",
" <td>Peoria</td>\n",
" <td>IL</td>\n",
" <td>61602.0</td>\n",
" <td>United States</td>\n",
" <td>309-671-8852EllaLund@armyspy.com</td>\n",
" <td>12/19/1933</td>\n",
" <td>144.8</td>\n",
" <td>61</td>\n",
" <td>27.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>28</td>\n",
" <td>male</td>\n",
" <td>Joseph</td>\n",
" <td>Tucker</td>\n",
" <td>4982 Wood Street</td>\n",
" <td>Venice</td>\n",
" <td>LA</td>\n",
" <td>70091.0</td>\n",
" <td>United States</td>\n",
" <td>985-814-7603JosephNTucker@rhyta.com</td>\n",
" <td>4/10/1959</td>\n",
" <td>175.8</td>\n",
" <td>72</td>\n",
" <td>23.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>29</td>\n",
" <td>male</td>\n",
" <td>Robert</td>\n",
" <td>Wolf</td>\n",
" <td>2386 Linda Street</td>\n",
" <td>Fort Washington</td>\n",
" <td>PA</td>\n",
" <td>19034.0</td>\n",
" <td>United States</td>\n",
" <td>RobertWolf@fleckens.hu1 267 895 7462</td>\n",
" <td>6/26/1937</td>\n",
" <td>206.6</td>\n",
" <td>70</td>\n",
" <td>29.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>30</td>\n",
" <td>male</td>\n",
" <td>Jake</td>\n",
" <td>Jakobsen</td>\n",
" <td>648 Old Dear Lane</td>\n",
" <td>Port Jervis</td>\n",
" <td>New York</td>\n",
" <td>12771.0</td>\n",
" <td>United States</td>\n",
" <td>JakobCJakobsen@einrot.com+1 (845) 858-7707</td>\n",
" <td>8/1/1985</td>\n",
" <td>155.8</td>\n",
" <td>67</td>\n",
" <td>24.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>473</th>\n",
" <td>474</td>\n",
" <td>female</td>\n",
" <td>Kate</td>\n",
" <td>Wilkinson</td>\n",
" <td>664 Lyon Avenue</td>\n",
" <td>South Boston</td>\n",
" <td>MA</td>\n",
" <td>2127.0</td>\n",
" <td>United States</td>\n",
" <td>KateWilkinson@armyspy.com1 508 905 2371</td>\n",
" <td>7/18/1998</td>\n",
" <td>175.3</td>\n",
" <td>65</td>\n",
" <td>29.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>474</th>\n",
" <td>475</td>\n",
" <td>female</td>\n",
" <td>Esperanza</td>\n",
" <td>Labrosse</td>\n",
" <td>1370 Flint Street</td>\n",
" <td>Atlanta</td>\n",
" <td>GA</td>\n",
" <td>30303.0</td>\n",
" <td>United States</td>\n",
" <td>EsperanzaLabrosse@armyspy.com678-263-3564</td>\n",
" <td>10/7/1961</td>\n",
" <td>181.5</td>\n",
" <td>63</td>\n",
" <td>32.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>475</th>\n",
" <td>476</td>\n",
" <td>male</td>\n",
" <td>Malik</td>\n",
" <td>Vaneker</td>\n",
" <td>1270 Haul Road</td>\n",
" <td>Mountain View</td>\n",
" <td>California</td>\n",
" <td>94041.0</td>\n",
" <td>United States</td>\n",
" <td>MalikVaneker@superrito.com650-962-7179</td>\n",
" <td>9/25/1953</td>\n",
" <td>214.4</td>\n",
" <td>67</td>\n",
" <td>33.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>476</th>\n",
" <td>477</td>\n",
" <td>female</td>\n",
" <td>Berta</td>\n",
" <td>Napolitani</td>\n",
" <td>1815 Garrett Street</td>\n",
" <td>Philadelphia</td>\n",
" <td>PA</td>\n",
" <td>19108.0</td>\n",
" <td>United States</td>\n",
" <td>267-972-3749BertaNapolitani@rhyta.com</td>\n",
" <td>12/2/1958</td>\n",
" <td>153.3</td>\n",
" <td>63</td>\n",
" <td>27.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>477</th>\n",
" <td>478</td>\n",
" <td>male</td>\n",
" <td>Juliusz</td>\n",
" <td>Majewski</td>\n",
" <td>4435 Poe Road</td>\n",
" <td>Florence</td>\n",
" <td>SC</td>\n",
" <td>29501.0</td>\n",
" <td>United States</td>\n",
" <td>JuliuszMajewski@superrito.com+1 (843) 212-6421</td>\n",
" <td>9/29/1966</td>\n",
" <td>212.1</td>\n",
" <td>69</td>\n",
" <td>31.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>478</th>\n",
" <td>479</td>\n",
" <td>female</td>\n",
" <td>Edelma</td>\n",
" <td>Villalpando</td>\n",
" <td>312 Jim Rosa Lane</td>\n",
" <td>San Jose</td>\n",
" <td>CA</td>\n",
" <td>95134.0</td>\n",
" <td>United States</td>\n",
" <td>EdelmaVillalpandoSantillan@teleworm.us+1 (415)...</td>\n",
" <td>6/24/1977</td>\n",
" <td>109.6</td>\n",
" <td>63</td>\n",
" <td>19.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>479</th>\n",
" <td>480</td>\n",
" <td>male</td>\n",
" <td>Tapa</td>\n",
" <td>Arsanukayev</td>\n",
" <td>4720 Gordon Street</td>\n",
" <td>Ontario</td>\n",
" <td>California</td>\n",
" <td>91762.0</td>\n",
" <td>United States</td>\n",
" <td>TapaArsanukayev@dayrep.com1 909 458 2515</td>\n",
" <td>9/15/1955</td>\n",
" <td>220.0</td>\n",
" <td>65</td>\n",
" <td>36.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>480</th>\n",
" <td>481</td>\n",
" <td>male</td>\n",
" <td>Nasser</td>\n",
" <td>Mansour</td>\n",
" <td>547 Weekley Street</td>\n",
" <td>San Antonio</td>\n",
" <td>TX</td>\n",
" <td>78212.0</td>\n",
" <td>United States</td>\n",
" <td>NasserMazinMansour@fleckens.hu1 210 326 5509</td>\n",
" <td>3/25/1938</td>\n",
" <td>183.5</td>\n",
" <td>66</td>\n",
" <td>29.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>481</th>\n",
" <td>482</td>\n",
" <td>male</td>\n",
" <td>Michael</td>\n",
" <td>Kristensen</td>\n",
" <td>1614 Heather Sees Way</td>\n",
" <td>Tulsa</td>\n",
" <td>OK</td>\n",
" <td>74116.0</td>\n",
" <td>United States</td>\n",
" <td>MichaelKristensen@gustr.com1 918 706 2776</td>\n",
" <td>8/10/1930</td>\n",
" <td>154.7</td>\n",
" <td>65</td>\n",
" <td>25.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>482</th>\n",
" <td>483</td>\n",
" <td>male</td>\n",
" <td>Diogo</td>\n",
" <td>Souza</td>\n",
" <td>4033 White Avenue</td>\n",
" <td>Corpus Christi</td>\n",
" <td>TX</td>\n",
" <td>78401.0</td>\n",
" <td>United States</td>\n",
" <td>361-693-4960DiogoBarrosSouza@jourrapide.com</td>\n",
" <td>3/3/1945</td>\n",
" <td>220.0</td>\n",
" <td>65</td>\n",
" <td>36.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>483</th>\n",
" <td>484</td>\n",
" <td>female</td>\n",
" <td>Angel</td>\n",
" <td>Grant</td>\n",
" <td>990 Melville Street</td>\n",
" <td>Memphis</td>\n",
" <td>TN</td>\n",
" <td>38118.0</td>\n",
" <td>United States</td>\n",
" <td>731-577-0292AngelGrant@fleckens.hu</td>\n",
" <td>8/14/1987</td>\n",
" <td>123.9</td>\n",
" <td>61</td>\n",
" <td>23.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>484</th>\n",
" <td>485</td>\n",
" <td>male</td>\n",
" <td>Placido</td>\n",
" <td>Udinesi</td>\n",
" <td>1094 Jones Avenue</td>\n",
" <td>Greensboro</td>\n",
" <td>NC</td>\n",
" <td>28716.0</td>\n",
" <td>United States</td>\n",
" <td>336-697-2005PlacidoUdinesi@dayrep.com</td>\n",
" <td>5/31/1934</td>\n",
" <td>175.8</td>\n",
" <td>65</td>\n",
" <td>29.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>485</th>\n",
" <td>486</td>\n",
" <td>male</td>\n",
" <td>Trifon</td>\n",
" <td>Izmailov</td>\n",
" <td>3697 Drainer Avenue</td>\n",
" <td>Fort Walton Beach</td>\n",
" <td>FL</td>\n",
" <td>32548.0</td>\n",
" <td>United States</td>\n",
" <td>TrifonIzmailov@fleckens.hu1 850 659 0417</td>\n",
" <td>2/15/1973</td>\n",
" <td>255.9</td>\n",
" <td>74</td>\n",
" <td>32.9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>486</th>\n",
" <td>487</td>\n",
" <td>male</td>\n",
" <td>Samuel</td>\n",
" <td>Blix</td>\n",
" <td>3488 Clair Street</td>\n",
" <td>Waco</td>\n",
" <td>TX</td>\n",
" <td>76706.0</td>\n",
" <td>United States</td>\n",
" <td>254-681-4504SamuelBlix@dayrep.com</td>\n",
" <td>7/6/1983</td>\n",
" <td>211.4</td>\n",
" <td>74</td>\n",
" <td>27.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>487</th>\n",
" <td>488</td>\n",
" <td>male</td>\n",
" <td>Ivar</td>\n",
" <td>Löfgren</td>\n",
" <td>1346 Nicholas Street</td>\n",
" <td>Ottawa</td>\n",
" <td>KS</td>\n",
" <td>66067.0</td>\n",
" <td>United States</td>\n",
" <td>IvarLofgren@armyspy.com1 785 229 1188</td>\n",
" <td>11/7/1962</td>\n",
" <td>242.4</td>\n",
" <td>77</td>\n",
" <td>28.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>488</th>\n",
" <td>489</td>\n",
" <td>male</td>\n",
" <td>Mika</td>\n",
" <td>Martinsson</td>\n",
" <td>962 George Street</td>\n",
" <td>Ocala</td>\n",
" <td>Florida</td>\n",
" <td>34471.0</td>\n",
" <td>United States</td>\n",
" <td>352-453-4601MikaMartinsson@armyspy.com</td>\n",
" <td>1/27/1970</td>\n",
" <td>165.0</td>\n",
" <td>67</td>\n",
" <td>25.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>489</th>\n",
" <td>490</td>\n",
" <td>female</td>\n",
" <td>Jasmine</td>\n",
" <td>Sykes</td>\n",
" <td>2607 Water Street</td>\n",
" <td>Lafayette</td>\n",
" <td>California</td>\n",
" <td>94549.0</td>\n",
" <td>United States</td>\n",
" <td>JasmineSykes@jourrapide.com925-283-5425</td>\n",
" <td>12/1/1988</td>\n",
" <td>187.2</td>\n",
" <td>63</td>\n",
" <td>33.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>490</th>\n",
" <td>491</td>\n",
" <td>male</td>\n",
" <td>Jackson</td>\n",
" <td>Addison</td>\n",
" <td>1160 Taylor Street</td>\n",
" <td>New Rochelle</td>\n",
" <td>New York</td>\n",
" <td>10801.0</td>\n",
" <td>United States</td>\n",
" <td>914-636-9304JacksonAddison@armyspy.com</td>\n",
" <td>5/29/1953</td>\n",
" <td>192.7</td>\n",
" <td>69</td>\n",
" <td>28.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>491</th>\n",
" <td>492</td>\n",
" <td>female</td>\n",
" <td>Vanessa</td>\n",
" <td>Ferguson</td>\n",
" <td>241 Freshour Circle</td>\n",
" <td>San Antonio</td>\n",
" <td>TX</td>\n",
" <td>78205.0</td>\n",
" <td>United States</td>\n",
" <td>210-222-8684VanessaFerguson@jourrapide.com</td>\n",
" <td>9/21/1950</td>\n",
" <td>149.8</td>\n",
" <td>67</td>\n",
" <td>23.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>492</th>\n",
" <td>493</td>\n",
" <td>male</td>\n",
" <td>Poldi</td>\n",
" <td>Tar</td>\n",
" <td>3958 Liberty Avenue</td>\n",
" <td>Burbank</td>\n",
" <td>California</td>\n",
" <td>91505.0</td>\n",
" <td>United States</td>\n",
" <td>714-496-2264TarPoldi@superrito.com</td>\n",
" <td>5/23/1970</td>\n",
" <td>184.6</td>\n",
" <td>70</td>\n",
" <td>26.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>493</th>\n",
" <td>494</td>\n",
" <td>female</td>\n",
" <td>Fen</td>\n",
" <td>Chin</td>\n",
" <td>1826 Poplar Chase Lane</td>\n",
" <td>Boise</td>\n",
" <td>ID</td>\n",
" <td>83702.0</td>\n",
" <td>United States</td>\n",
" <td>FenChin@gustr.com+1 (208) 388-1065</td>\n",
" <td>3/18/1997</td>\n",
" <td>195.1</td>\n",
" <td>68</td>\n",
" <td>29.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>494</th>\n",
" <td>495</td>\n",
" <td>female</td>\n",
" <td>Sirkka</td>\n",
" <td>Piirainen</td>\n",
" <td>4102 Ritter Avenue</td>\n",
" <td>Roseville</td>\n",
" <td>MI</td>\n",
" <td>48066.0</td>\n",
" <td>United States</td>\n",
" <td>SirkkaPiirainen@teleworm.us+1 (586) 790-0975</td>\n",
" <td>1/16/1942</td>\n",
" <td>126.3</td>\n",
" <td>67</td>\n",
" <td>19.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>495</th>\n",
" <td>496</td>\n",
" <td>male</td>\n",
" <td>Hajime</td>\n",
" <td>Tsukada</td>\n",
" <td>4111 Thunder Road</td>\n",
" <td>San Mateo</td>\n",
" <td>CA</td>\n",
" <td>94403.0</td>\n",
" <td>United States</td>\n",
" <td>650-570-4896HajimeTsukada@dayrep.com</td>\n",
" <td>9/5/1972</td>\n",
" <td>168.1</td>\n",
" <td>66</td>\n",
" <td>27.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>496</th>\n",
" <td>497</td>\n",
" <td>male</td>\n",
" <td>Alexander</td>\n",
" <td>Hueber</td>\n",
" <td>3868 Freed Drive</td>\n",
" <td>Stockton</td>\n",
" <td>California</td>\n",
" <td>95204.0</td>\n",
" <td>United States</td>\n",
" <td>AlexanderHueber@jourrapide.com1 209 762 2320</td>\n",
" <td>9/12/1942</td>\n",
" <td>194.0</td>\n",
" <td>72</td>\n",
" <td>26.3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>497</th>\n",
" <td>498</td>\n",
" <td>male</td>\n",
" <td>Masataka</td>\n",
" <td>Murakami</td>\n",
" <td>1179 Patton Lane</td>\n",
" <td>Tulsa</td>\n",
" <td>OK</td>\n",
" <td>74116.0</td>\n",
" <td>United States</td>\n",
" <td>MasatakaMurakami@einrot.com+1 (918) 984-9171</td>\n",
" <td>8/19/1937</td>\n",
" <td>155.1</td>\n",
" <td>72</td>\n",
" <td>21.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>498</th>\n",
" <td>499</td>\n",
" <td>male</td>\n",
" <td>Mustafa</td>\n",
" <td>Lindström</td>\n",
" <td>2530 Victoria Court</td>\n",
" <td>Milton Mills</td>\n",
" <td>ME</td>\n",
" <td>3852.0</td>\n",
" <td>United States</td>\n",
" <td>207-477-0579MustafaLindstrom@jourrapide.com</td>\n",
" <td>4/10/1959</td>\n",
" <td>181.1</td>\n",
" <td>72</td>\n",
" <td>24.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>499</th>\n",
" <td>500</td>\n",
" <td>male</td>\n",
" <td>Ruman</td>\n",
" <td>Bisliev</td>\n",
" <td>494 Clarksburg Park Road</td>\n",
" <td>Sedona</td>\n",
" <td>AZ</td>\n",
" <td>86341.0</td>\n",
" <td>United States</td>\n",
" <td>928-284-4492RumanBisliev@gustr.com</td>\n",
" <td>3/26/1948</td>\n",
" <td>239.6</td>\n",
" <td>70</td>\n",
" <td>34.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500</th>\n",
" <td>501</td>\n",
" <td>female</td>\n",
" <td>Jinke</td>\n",
" <td>de Keizer</td>\n",
" <td>649 Nutter Street</td>\n",
" <td>Overland Park</td>\n",
" <td>MO</td>\n",
" <td>64110.0</td>\n",
" <td>United States</td>\n",
" <td>816-223-6007JinkedeKeizer@teleworm.us</td>\n",
" <td>1/13/1971</td>\n",
" <td>171.2</td>\n",
" <td>67</td>\n",
" <td>26.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>501</th>\n",
" <td>502</td>\n",
" <td>female</td>\n",
" <td>Chidalu</td>\n",
" <td>Onyekaozulu</td>\n",
" <td>3652 Boone Crockett Lane</td>\n",
" <td>Seattle</td>\n",
" <td>WA</td>\n",
" <td>98109.0</td>\n",
" <td>United States</td>\n",
" <td>ChidaluOnyekaozulu@jourrapide.com1 360 443 2060</td>\n",
" <td>2/13/1952</td>\n",
" <td>176.9</td>\n",
" <td>67</td>\n",
" <td>27.7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>502</th>\n",
" <td>503</td>\n",
" <td>male</td>\n",
" <td>Pat</td>\n",
" <td>Gersten</td>\n",
" <td>2778 North Avenue</td>\n",
" <td>Burr</td>\n",
" <td>Nebraska</td>\n",
" <td>68324.0</td>\n",
" <td>United States</td>\n",
" <td>PatrickGersten@rhyta.com402-848-4923</td>\n",
" <td>5/3/1954</td>\n",
" <td>138.2</td>\n",
" <td>71</td>\n",
" <td>19.3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>503 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" patient_id assigned_sex given_name surname \\\n",
"0 1 female Zoe Wellish \n",
"1 2 female Pamela Hill \n",
"2 3 male Jae Debord \n",
"3 4 male Liêm Phan \n",
"4 5 male Tim Neudorf \n",
"5 6 male Rafael Costa \n",
"6 7 female Mary Adams \n",
"7 8 female Xiuxiu Chang \n",
"8 9 male Dsvid Gustafsson \n",
"9 10 female Sophie Cabrera \n",
"10 11 female Sandy Gunnarsson \n",
"11 12 male Abdul-Nur Isa \n",
"12 13 male Omeokachie Ibeamaka \n",
"13 14 female Anenechi Chidi \n",
"14 15 female Asia Woźniak \n",
"15 16 male Søren Lund \n",
"16 17 female Tám Liễu \n",
"17 18 female Roxanne Andreyeva \n",
"18 19 male William Oates \n",
"19 20 male Zak Kelly \n",
"20 21 female Sofia Karlsen \n",
"21 22 male Samúel Guðbrandsson \n",
"22 23 male Manchu Su \n",
"23 24 male Lovre Galić \n",
"24 25 male Jakob Jakobsen \n",
"25 26 male Gregor Bole \n",
"26 27 female Ella Lund \n",
"27 28 male Joseph Tucker \n",
"28 29 male Robert Wolf \n",
"29 30 male Jake Jakobsen \n",
".. ... ... ... ... \n",
"473 474 female Kate Wilkinson \n",
"474 475 female Esperanza Labrosse \n",
"475 476 male Malik Vaneker \n",
"476 477 female Berta Napolitani \n",
"477 478 male Juliusz Majewski \n",
"478 479 female Edelma Villalpando \n",
"479 480 male Tapa Arsanukayev \n",
"480 481 male Nasser Mansour \n",
"481 482 male Michael Kristensen \n",
"482 483 male Diogo Souza \n",
"483 484 female Angel Grant \n",
"484 485 male Placido Udinesi \n",
"485 486 male Trifon Izmailov \n",
"486 487 male Samuel Blix \n",
"487 488 male Ivar Löfgren \n",
"488 489 male Mika Martinsson \n",
"489 490 female Jasmine Sykes \n",
"490 491 male Jackson Addison \n",
"491 492 female Vanessa Ferguson \n",
"492 493 male Poldi Tar \n",
"493 494 female Fen Chin \n",
"494 495 female Sirkka Piirainen \n",
"495 496 male Hajime Tsukada \n",
"496 497 male Alexander Hueber \n",
"497 498 male Masataka Murakami \n",
"498 499 male Mustafa Lindström \n",
"499 500 male Ruman Bisliev \n",
"500 501 female Jinke de Keizer \n",
"501 502 female Chidalu Onyekaozulu \n",
"502 503 male Pat Gersten \n",
"\n",
" address city state zip_code \\\n",
"0 576 Brown Bear Drive Rancho California California 92390.0 \n",
"1 2370 University Hill Road Armstrong Illinois 61812.0 \n",
"2 1493 Poling Farm Road York Nebraska 68467.0 \n",
"3 2335 Webster Street Woodbridge NJ 7095.0 \n",
"4 1428 Turkey Pen Lane Dothan AL 36303.0 \n",
"5 1140 Willis Avenue Daytona Beach Florida 32114.0 \n",
"6 3145 Sheila Lane Burbank NV 84728.0 \n",
"7 2687 Black Oak Hollow Road Morgan Hill CA 95037.0 \n",
"8 1790 Nutter Street Kansas City MO 64105.0 \n",
"9 3303 Anmoore Road New York New York 10011.0 \n",
"10 87 Wood Duck Drive Rudyard MI 49780.0 \n",
"11 1092 Farm Meadow Drive Brentwood TN 37027.0 \n",
"12 2544 Worley Avenue Lynchburg VA 24504.0 \n",
"13 826 Broad Street Birmingham AL 35203.0 \n",
"14 4970 Heather Sees Way Tulsa OK 74105.0 \n",
"15 2438 Shady Pines Drive Kingsport VA 37660.0 \n",
"16 2152 Heritage Road Fresno California 93706.0 \n",
"17 2103 Edington Drive Smyrna GA 30082.0 \n",
"18 441 Tibbs Avenue Ekalaka MT 59324.0 \n",
"19 994 Hill Croft Farm Road Oroville California 95966.0 \n",
"20 2931 Romano Street Whitman MA 2382.0 \n",
"21 1904 Granville Lane Elmsford NJ 10523.0 \n",
"22 1092 Deans Lane Pleasantville NY 10570.0 \n",
"23 4941 Marion Drive Winter Haven Florida 33830.0 \n",
"24 648 Old Dear Lane Port Jervis New York 12771.0 \n",
"25 922 Chapmans Lane Albuquerque NM 87109.0 \n",
"26 1207 Garfield Road Peoria IL 61602.0 \n",
"27 4982 Wood Street Venice LA 70091.0 \n",
"28 2386 Linda Street Fort Washington PA 19034.0 \n",
"29 648 Old Dear Lane Port Jervis New York 12771.0 \n",
".. ... ... ... ... \n",
"473 664 Lyon Avenue South Boston MA 2127.0 \n",
"474 1370 Flint Street Atlanta GA 30303.0 \n",
"475 1270 Haul Road Mountain View California 94041.0 \n",
"476 1815 Garrett Street Philadelphia PA 19108.0 \n",
"477 4435 Poe Road Florence SC 29501.0 \n",
"478 312 Jim Rosa Lane San Jose CA 95134.0 \n",
"479 4720 Gordon Street Ontario California 91762.0 \n",
"480 547 Weekley Street San Antonio TX 78212.0 \n",
"481 1614 Heather Sees Way Tulsa OK 74116.0 \n",
"482 4033 White Avenue Corpus Christi TX 78401.0 \n",
"483 990 Melville Street Memphis TN 38118.0 \n",
"484 1094 Jones Avenue Greensboro NC 28716.0 \n",
"485 3697 Drainer Avenue Fort Walton Beach FL 32548.0 \n",
"486 3488 Clair Street Waco TX 76706.0 \n",
"487 1346 Nicholas Street Ottawa KS 66067.0 \n",
"488 962 George Street Ocala Florida 34471.0 \n",
"489 2607 Water Street Lafayette California 94549.0 \n",
"490 1160 Taylor Street New Rochelle New York 10801.0 \n",
"491 241 Freshour Circle San Antonio TX 78205.0 \n",
"492 3958 Liberty Avenue Burbank California 91505.0 \n",
"493 1826 Poplar Chase Lane Boise ID 83702.0 \n",
"494 4102 Ritter Avenue Roseville MI 48066.0 \n",
"495 4111 Thunder Road San Mateo CA 94403.0 \n",
"496 3868 Freed Drive Stockton California 95204.0 \n",
"497 1179 Patton Lane Tulsa OK 74116.0 \n",
"498 2530 Victoria Court Milton Mills ME 3852.0 \n",
"499 494 Clarksburg Park Road Sedona AZ 86341.0 \n",
"500 649 Nutter Street Overland Park MO 64110.0 \n",
"501 3652 Boone Crockett Lane Seattle WA 98109.0 \n",
"502 2778 North Avenue Burr Nebraska 68324.0 \n",
"\n",
" country contact \\\n",
"0 United States 951-719-9170ZoeWellish@superrito.com \n",
"1 United States PamelaSHill@cuvox.de+1 (217) 569-3204 \n",
"2 United States 402-363-6804JaeMDebord@gustr.com \n",
"3 United States PhanBaLiem@jourrapide.com+1 (732) 636-8246 \n",
"4 United States 334-515-7487TimNeudorf@cuvox.de \n",
"5 United States 386-334-5237RafaelCardosoCosta@gustr.com \n",
"6 United States 775-533-5933MaryBAdams@einrot.com \n",
"7 United States XiuxiuChang@einrot.com1 408 778 3236 \n",
"8 United States 816-265-9578DavidGustafsson@armyspy.com \n",
"9 United States SophieCabreraIbarra@teleworm.us1 718 795 9124 \n",
"10 United States 906-478-8949SandyGunnarsson@dayrep.com \n",
"11 United States Abdul-NurMummarIsa@rhyta.com1 931 207 0839 \n",
"12 United States OmeokachieIbeamaka@einrot.com434-509-2614 \n",
"13 United States AnenechiChidi@armyspy.com+1 (205) 417-8095 \n",
"14 United States AsiaWozniak@rhyta.com918-712-3469 \n",
"15 United States 276-225-1955SrenFLund@gustr.com \n",
"16 United States LieuThiThuTam@dayrep.com1 559 765 7836 \n",
"17 United States RoxanneAndreyeva@armyspy.com678-829-8578 \n",
"18 United States 406-775-2696WilliamVOates@armyspy.com \n",
"19 United States ZakKelly@rhyta.com1 530 532 8397 \n",
"20 United States SofiaTKarlsen@teleworm.us1 781 447 1763 \n",
"21 United States 973-445-5341SamuelGubrandsson@teleworm.us \n",
"22 United States 914-745-6108ManchuSu@einrot.com \n",
"23 United States LovreGalic@gustr.com1 813 355 9476 \n",
"24 United States JakobCJakobsen@einrot.com+1 (845) 858-7707 \n",
"25 United States GregorBole@gustr.com505-828-4955 \n",
"26 United States 309-671-8852EllaLund@armyspy.com \n",
"27 United States 985-814-7603JosephNTucker@rhyta.com \n",
"28 United States RobertWolf@fleckens.hu1 267 895 7462 \n",
"29 United States JakobCJakobsen@einrot.com+1 (845) 858-7707 \n",
".. ... ... \n",
"473 United States KateWilkinson@armyspy.com1 508 905 2371 \n",
"474 United States EsperanzaLabrosse@armyspy.com678-263-3564 \n",
"475 United States MalikVaneker@superrito.com650-962-7179 \n",
"476 United States 267-972-3749BertaNapolitani@rhyta.com \n",
"477 United States JuliuszMajewski@superrito.com+1 (843) 212-6421 \n",
"478 United States EdelmaVillalpandoSantillan@teleworm.us+1 (415)... \n",
"479 United States TapaArsanukayev@dayrep.com1 909 458 2515 \n",
"480 United States NasserMazinMansour@fleckens.hu1 210 326 5509 \n",
"481 United States MichaelKristensen@gustr.com1 918 706 2776 \n",
"482 United States 361-693-4960DiogoBarrosSouza@jourrapide.com \n",
"483 United States 731-577-0292AngelGrant@fleckens.hu \n",
"484 United States 336-697-2005PlacidoUdinesi@dayrep.com \n",
"485 United States TrifonIzmailov@fleckens.hu1 850 659 0417 \n",
"486 United States 254-681-4504SamuelBlix@dayrep.com \n",
"487 United States IvarLofgren@armyspy.com1 785 229 1188 \n",
"488 United States 352-453-4601MikaMartinsson@armyspy.com \n",
"489 United States JasmineSykes@jourrapide.com925-283-5425 \n",
"490 United States 914-636-9304JacksonAddison@armyspy.com \n",
"491 United States 210-222-8684VanessaFerguson@jourrapide.com \n",
"492 United States 714-496-2264TarPoldi@superrito.com \n",
"493 United States FenChin@gustr.com+1 (208) 388-1065 \n",
"494 United States SirkkaPiirainen@teleworm.us+1 (586) 790-0975 \n",
"495 United States 650-570-4896HajimeTsukada@dayrep.com \n",
"496 United States AlexanderHueber@jourrapide.com1 209 762 2320 \n",
"497 United States MasatakaMurakami@einrot.com+1 (918) 984-9171 \n",
"498 United States 207-477-0579MustafaLindstrom@jourrapide.com \n",
"499 United States 928-284-4492RumanBisliev@gustr.com \n",
"500 United States 816-223-6007JinkedeKeizer@teleworm.us \n",
"501 United States ChidaluOnyekaozulu@jourrapide.com1 360 443 2060 \n",
"502 United States PatrickGersten@rhyta.com402-848-4923 \n",
"\n",
" birthdate weight height bmi \n",
"0 7/10/1976 121.7 66 19.6 \n",
"1 4/3/1967 118.8 66 19.2 \n",
"2 2/19/1980 177.8 71 24.8 \n",
"3 7/26/1951 220.9 70 31.7 \n",
"4 2/18/1928 192.3 27 26.1 \n",
"5 8/31/1931 183.9 70 26.4 \n",
"6 11/19/1969 146.3 65 24.3 \n",
"7 8/13/1958 158.0 60 30.9 \n",
"8 3/6/1937 163.9 66 26.5 \n",
"9 12/3/1930 194.7 64 33.4 \n",
"10 7/16/1974 199.3 62 36.4 \n",
"11 2/3/1954 238.7 73 31.5 \n",
"12 8/5/1957 224.2 69 33.1 \n",
"13 3/7/1961 228.4 67 35.8 \n",
"14 8/15/1997 112.0 65 18.6 \n",
"15 8/23/1922 201.5 64 34.6 \n",
"16 11/14/1952 183.9 61 34.7 \n",
"17 7/24/1922 129.1 60 25.2 \n",
"18 9/4/1949 202.2 64 34.7 \n",
"19 12/13/1988 208.8 70 30.0 \n",
"20 9/24/1934 153.1 66 24.7 \n",
"21 4/12/1983 223.7 69 33.0 \n",
"22 1/19/1936 130.7 65 21.7 \n",
"23 5/26/1960 222.9 66 36.0 \n",
"24 8/1/1985 155.8 67 24.4 \n",
"25 6/19/1922 180.8 67 28.3 \n",
"26 12/19/1933 144.8 61 27.4 \n",
"27 4/10/1959 175.8 72 23.8 \n",
"28 6/26/1937 206.6 70 29.6 \n",
"29 8/1/1985 155.8 67 24.4 \n",
".. ... ... ... ... \n",
"473 7/18/1998 175.3 65 29.2 \n",
"474 10/7/1961 181.5 63 32.1 \n",
"475 9/25/1953 214.4 67 33.6 \n",
"476 12/2/1958 153.3 63 27.2 \n",
"477 9/29/1966 212.1 69 31.3 \n",
"478 6/24/1977 109.6 63 19.4 \n",
"479 9/15/1955 220.0 65 36.6 \n",
"480 3/25/1938 183.5 66 29.6 \n",
"481 8/10/1930 154.7 65 25.7 \n",
"482 3/3/1945 220.0 65 36.6 \n",
"483 8/14/1987 123.9 61 23.4 \n",
"484 5/31/1934 175.8 65 29.3 \n",
"485 2/15/1973 255.9 74 32.9 \n",
"486 7/6/1983 211.4 74 27.1 \n",
"487 11/7/1962 242.4 77 28.7 \n",
"488 1/27/1970 165.0 67 25.8 \n",
"489 12/1/1988 187.2 63 33.2 \n",
"490 5/29/1953 192.7 69 28.5 \n",
"491 9/21/1950 149.8 67 23.5 \n",
"492 5/23/1970 184.6 70 26.5 \n",
"493 3/18/1997 195.1 68 29.7 \n",
"494 1/16/1942 126.3 67 19.8 \n",
"495 9/5/1972 168.1 66 27.1 \n",
"496 9/12/1942 194.0 72 26.3 \n",
"497 8/19/1937 155.1 72 21.0 \n",
"498 4/10/1959 181.1 72 24.6 \n",
"499 3/26/1948 239.6 70 34.4 \n",
"500 1/13/1971 171.2 67 26.8 \n",
"501 2/13/1952 176.9 67 27.7 \n",
"502 5/3/1954 138.2 71 19.3 \n",
"\n",
"[503 rows x 14 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>auralin</th>\n",
" <th>novodra</th>\n",
" <th>hba1c_start</th>\n",
" <th>hba1c_end</th>\n",
" <th>hba1c_change</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>veronika</td>\n",
" <td>jindrová</td>\n",
" <td>41u - 48u</td>\n",
" <td>-</td>\n",
" <td>7.63</td>\n",
" <td>7.20</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>elliot</td>\n",
" <td>richardson</td>\n",
" <td>-</td>\n",
" <td>40u - 45u</td>\n",
" <td>7.56</td>\n",
" <td>7.09</td>\n",
" <td>0.97</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>yukitaka</td>\n",
" <td>takenaka</td>\n",
" <td>-</td>\n",
" <td>39u - 36u</td>\n",
" <td>7.68</td>\n",
" <td>7.25</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>skye</td>\n",
" <td>gormanston</td>\n",
" <td>33u - 36u</td>\n",
" <td>-</td>\n",
" <td>7.97</td>\n",
" <td>7.62</td>\n",
" <td>0.35</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>alissa</td>\n",
" <td>montez</td>\n",
" <td>-</td>\n",
" <td>33u - 29u</td>\n",
" <td>7.78</td>\n",
" <td>7.46</td>\n",
" <td>0.32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>jasmine</td>\n",
" <td>sykes</td>\n",
" <td>-</td>\n",
" <td>42u - 44u</td>\n",
" <td>7.56</td>\n",
" <td>7.18</td>\n",
" <td>0.38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>sophia</td>\n",
" <td>haugen</td>\n",
" <td>37u - 42u</td>\n",
" <td>-</td>\n",
" <td>7.65</td>\n",
" <td>7.27</td>\n",
" <td>0.38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>eddie</td>\n",
" <td>archer</td>\n",
" <td>31u - 38u</td>\n",
" <td>-</td>\n",
" <td>7.89</td>\n",
" <td>7.55</td>\n",
" <td>0.34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>saber</td>\n",
" <td>ménard</td>\n",
" <td>-</td>\n",
" <td>54u - 54u</td>\n",
" <td>8.08</td>\n",
" <td>7.70</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>asia</td>\n",
" <td>woźniak</td>\n",
" <td>30u - 36u</td>\n",
" <td>-</td>\n",
" <td>7.76</td>\n",
" <td>7.37</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>joseph</td>\n",
" <td>day</td>\n",
" <td>29u - 36u</td>\n",
" <td>-</td>\n",
" <td>7.70</td>\n",
" <td>7.19</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>kristiina</td>\n",
" <td>hyypiä</td>\n",
" <td>-</td>\n",
" <td>36u - 38u</td>\n",
" <td>7.87</td>\n",
" <td>7.49</td>\n",
" <td>0.38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>roxanne</td>\n",
" <td>andreyeva</td>\n",
" <td>29u - 38u</td>\n",
" <td>-</td>\n",
" <td>9.54</td>\n",
" <td>9.14</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>gregor</td>\n",
" <td>bole</td>\n",
" <td>-</td>\n",
" <td>47u - 45u</td>\n",
" <td>7.61</td>\n",
" <td>7.16</td>\n",
" <td>0.95</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>simone</td>\n",
" <td>baumgaertner</td>\n",
" <td>27u - 37u</td>\n",
" <td>-</td>\n",
" <td>7.74</td>\n",
" <td>7.30</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>enco</td>\n",
" <td>žibrik</td>\n",
" <td>55u - 68u</td>\n",
" <td>-</td>\n",
" <td>7.78</td>\n",
" <td>7.34</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>camilla</td>\n",
" <td>zaitseva</td>\n",
" <td>28u - 37u</td>\n",
" <td>-</td>\n",
" <td>7.53</td>\n",
" <td>7.13</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>gina</td>\n",
" <td>cain</td>\n",
" <td>-</td>\n",
" <td>36u - 36u</td>\n",
" <td>7.88</td>\n",
" <td>7.40</td>\n",
" <td>0.98</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>addolorata</td>\n",
" <td>lombardi</td>\n",
" <td>-</td>\n",
" <td>49u - 46u</td>\n",
" <td>7.75</td>\n",
" <td>7.33</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>khalid</td>\n",
" <td>johnsrud</td>\n",
" <td>-</td>\n",
" <td>54u - 54u</td>\n",
" <td>8.35</td>\n",
" <td>7.94</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>mile</td>\n",
" <td>stanić</td>\n",
" <td>-</td>\n",
" <td>47u - 48u</td>\n",
" <td>7.66</td>\n",
" <td>7.24</td>\n",
" <td>0.92</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>tekla</td>\n",
" <td>walczak</td>\n",
" <td>29u - 39u</td>\n",
" <td>-</td>\n",
" <td>7.61</td>\n",
" <td>7.29</td>\n",
" <td>0.32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>brancaleone</td>\n",
" <td>russo</td>\n",
" <td>53u - 60u</td>\n",
" <td>-</td>\n",
" <td>8.61</td>\n",
" <td>8.18</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>chiemela</td>\n",
" <td>tobeolisa</td>\n",
" <td>-</td>\n",
" <td>43u - 47u</td>\n",
" <td>7.59</td>\n",
" <td>7.17</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>isac</td>\n",
" <td>berg</td>\n",
" <td>31u - 41u</td>\n",
" <td>-</td>\n",
" <td>9.68</td>\n",
" <td>9.29</td>\n",
" <td>0.39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>benoît</td>\n",
" <td>bonami</td>\n",
" <td>-</td>\n",
" <td>44u - 43u</td>\n",
" <td>9.82</td>\n",
" <td>9.40</td>\n",
" <td>0.92</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>suhaim</td>\n",
" <td>rahal</td>\n",
" <td>-</td>\n",
" <td>49u - 47u</td>\n",
" <td>7.94</td>\n",
" <td>7.50</td>\n",
" <td>0.94</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>mizuki</td>\n",
" <td>iwata</td>\n",
" <td>-</td>\n",
" <td>45u - 46u</td>\n",
" <td>7.70</td>\n",
" <td>7.23</td>\n",
" <td>0.97</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>clinton</td>\n",
" <td>miller</td>\n",
" <td>42u - 51u</td>\n",
" <td>-</td>\n",
" <td>7.79</td>\n",
" <td>7.40</td>\n",
" <td>0.39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>eugene</td>\n",
" <td>mironov</td>\n",
" <td>42u - 49u</td>\n",
" <td>-</td>\n",
" <td>7.81</td>\n",
" <td>7.48</td>\n",
" <td>0.33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>250</th>\n",
" <td>chen</td>\n",
" <td>yao</td>\n",
" <td>-</td>\n",
" <td>56u - 57u</td>\n",
" <td>7.90</td>\n",
" <td>7.51</td>\n",
" <td>0.39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>251</th>\n",
" <td>aksel</td>\n",
" <td>vestergaard</td>\n",
" <td>-</td>\n",
" <td>42u - 38u</td>\n",
" <td>9.62</td>\n",
" <td>9.29</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>252</th>\n",
" <td>ellen</td>\n",
" <td>luman</td>\n",
" <td>-</td>\n",
" <td>40u - 39u</td>\n",
" <td>9.27</td>\n",
" <td>8.77</td>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>253</th>\n",
" <td>albino</td>\n",
" <td>schiavone</td>\n",
" <td>35u - 43u</td>\n",
" <td>-</td>\n",
" <td>7.56</td>\n",
" <td>7.15</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>254</th>\n",
" <td>jose</td>\n",
" <td>combs</td>\n",
" <td>-</td>\n",
" <td>39u - 36u</td>\n",
" <td>7.89</td>\n",
" <td>7.42</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>255</th>\n",
" <td>jia li</td>\n",
" <td>teng</td>\n",
" <td>48u - 54u</td>\n",
" <td>-</td>\n",
" <td>7.66</td>\n",
" <td>7.32</td>\n",
" <td>0.34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>256</th>\n",
" <td>ilija</td>\n",
" <td>horvat</td>\n",
" <td>42u - 50u</td>\n",
" <td>-</td>\n",
" <td>7.77</td>\n",
" <td>7.38</td>\n",
" <td>0.39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257</th>\n",
" <td>mathilde</td>\n",
" <td>nørgaard</td>\n",
" <td>-</td>\n",
" <td>27u - 28u</td>\n",
" <td>8.50</td>\n",
" <td>8.10</td>\n",
" <td>0.90</td>\n",
" </tr>\n",
" <tr>\n",
" <th>258</th>\n",
" <td>csilla</td>\n",
" <td>herczegh</td>\n",
" <td>-</td>\n",
" <td>43u - 46u</td>\n",
" <td>7.71</td>\n",
" <td>7.27</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>259</th>\n",
" <td>aaliyah</td>\n",
" <td>rice</td>\n",
" <td>-</td>\n",
" <td>31u - 31u</td>\n",
" <td>7.64</td>\n",
" <td>7.33</td>\n",
" <td>0.31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>260</th>\n",
" <td>david</td>\n",
" <td>beauvais</td>\n",
" <td>-</td>\n",
" <td>26u - 23u</td>\n",
" <td>7.87</td>\n",
" <td>7.47</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>261</th>\n",
" <td>caroline</td>\n",
" <td>shuler</td>\n",
" <td>-</td>\n",
" <td>50u - 54u</td>\n",
" <td>7.63</td>\n",
" <td>7.27</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>262</th>\n",
" <td>alex</td>\n",
" <td>crawford</td>\n",
" <td>51u - 62u</td>\n",
" <td>-</td>\n",
" <td>7.69</td>\n",
" <td>7.30</td>\n",
" <td>0.39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>263</th>\n",
" <td>rebecca</td>\n",
" <td>jephcott</td>\n",
" <td>53u - 63u</td>\n",
" <td>-</td>\n",
" <td>7.96</td>\n",
" <td>7.57</td>\n",
" <td>0.39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>264</th>\n",
" <td>chukwumoge</td>\n",
" <td>ogochukwu</td>\n",
" <td>-</td>\n",
" <td>41u - 39u</td>\n",
" <td>7.95</td>\n",
" <td>7.56</td>\n",
" <td>0.39</td>\n",
" </tr>\n",
" <tr>\n",
" <th>265</th>\n",
" <td>fearne</td>\n",
" <td>mcgregor</td>\n",
" <td>-</td>\n",
" <td>27u - 29u</td>\n",
" <td>7.83</td>\n",
" <td>7.48</td>\n",
" <td>0.35</td>\n",
" </tr>\n",
" <tr>\n",
" <th>266</th>\n",
" <td>ursula</td>\n",
" <td>freud</td>\n",
" <td>42u - 54u</td>\n",
" <td>-</td>\n",
" <td>7.75</td>\n",
" <td>7.46</td>\n",
" <td>0.29</td>\n",
" </tr>\n",
" <tr>\n",
" <th>267</th>\n",
" <td>leon</td>\n",
" <td>scholz</td>\n",
" <td>-</td>\n",
" <td>38u - 32u</td>\n",
" <td>7.72</td>\n",
" <td>7.29</td>\n",
" <td>0.93</td>\n",
" </tr>\n",
" <tr>\n",
" <th>268</th>\n",
" <td>yasmin</td>\n",
" <td>araujo</td>\n",
" <td>-</td>\n",
" <td>51u - 54u</td>\n",
" <td>7.82</td>\n",
" <td>7.36</td>\n",
" <td>0.96</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>hiromu</td>\n",
" <td>horikawa</td>\n",
" <td>-</td>\n",
" <td>47u - 46u</td>\n",
" <td>7.77</td>\n",
" <td>7.28</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>270</th>\n",
" <td>mika</td>\n",
" <td>martinsson</td>\n",
" <td>34u - 43u</td>\n",
" <td>-</td>\n",
" <td>7.50</td>\n",
" <td>7.17</td>\n",
" <td>0.33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>271</th>\n",
" <td>leo</td>\n",
" <td>vieira</td>\n",
" <td>-</td>\n",
" <td>30u - 33u</td>\n",
" <td>7.74</td>\n",
" <td>7.36</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>272</th>\n",
" <td>steven</td>\n",
" <td>roy</td>\n",
" <td>-</td>\n",
" <td>41u - 43u</td>\n",
" <td>7.87</td>\n",
" <td>7.43</td>\n",
" <td>0.94</td>\n",
" </tr>\n",
" <tr>\n",
" <th>273</th>\n",
" <td>kate</td>\n",
" <td>wilkinson</td>\n",
" <td>36u - 39u</td>\n",
" <td>-</td>\n",
" <td>7.72</td>\n",
" <td>7.20</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>274</th>\n",
" <td>naja</td>\n",
" <td>enoksen</td>\n",
" <td>43u - 50u</td>\n",
" <td>-</td>\n",
" <td>7.98</td>\n",
" <td>7.59</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>275</th>\n",
" <td>albina</td>\n",
" <td>zetticci</td>\n",
" <td>45u - 51u</td>\n",
" <td>-</td>\n",
" <td>7.93</td>\n",
" <td>7.73</td>\n",
" <td>0.20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>276</th>\n",
" <td>john</td>\n",
" <td>teichelmann</td>\n",
" <td>-</td>\n",
" <td>49u - 49u</td>\n",
" <td>7.90</td>\n",
" <td>7.58</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>277</th>\n",
" <td>mathea</td>\n",
" <td>lillebø</td>\n",
" <td>23u - 36u</td>\n",
" <td>-</td>\n",
" <td>9.04</td>\n",
" <td>8.67</td>\n",
" <td>0.37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>278</th>\n",
" <td>vallie</td>\n",
" <td>prince</td>\n",
" <td>31u - 38u</td>\n",
" <td>-</td>\n",
" <td>7.64</td>\n",
" <td>7.28</td>\n",
" <td>0.36</td>\n",
" </tr>\n",
" <tr>\n",
" <th>279</th>\n",
" <td>samúel</td>\n",
" <td>guðbrandsson</td>\n",
" <td>53u - 56u</td>\n",
" <td>-</td>\n",
" <td>8.00</td>\n",
" <td>7.64</td>\n",
" <td>0.36</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>280 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" given_name surname auralin novodra hba1c_start hba1c_end \\\n",
"0 veronika jindrová 41u - 48u - 7.63 7.20 \n",
"1 elliot richardson - 40u - 45u 7.56 7.09 \n",
"2 yukitaka takenaka - 39u - 36u 7.68 7.25 \n",
"3 skye gormanston 33u - 36u - 7.97 7.62 \n",
"4 alissa montez - 33u - 29u 7.78 7.46 \n",
"5 jasmine sykes - 42u - 44u 7.56 7.18 \n",
"6 sophia haugen 37u - 42u - 7.65 7.27 \n",
"7 eddie archer 31u - 38u - 7.89 7.55 \n",
"8 saber ménard - 54u - 54u 8.08 7.70 \n",
"9 asia woźniak 30u - 36u - 7.76 7.37 \n",
"10 joseph day 29u - 36u - 7.70 7.19 \n",
"11 kristiina hyypiä - 36u - 38u 7.87 7.49 \n",
"12 roxanne andreyeva 29u - 38u - 9.54 9.14 \n",
"13 gregor bole - 47u - 45u 7.61 7.16 \n",
"14 simone baumgaertner 27u - 37u - 7.74 7.30 \n",
"15 enco žibrik 55u - 68u - 7.78 7.34 \n",
"16 camilla zaitseva 28u - 37u - 7.53 7.13 \n",
"17 gina cain - 36u - 36u 7.88 7.40 \n",
"18 addolorata lombardi - 49u - 46u 7.75 7.33 \n",
"19 khalid johnsrud - 54u - 54u 8.35 7.94 \n",
"20 mile stanić - 47u - 48u 7.66 7.24 \n",
"21 tekla walczak 29u - 39u - 7.61 7.29 \n",
"22 brancaleone russo 53u - 60u - 8.61 8.18 \n",
"23 chiemela tobeolisa - 43u - 47u 7.59 7.17 \n",
"24 isac berg 31u - 41u - 9.68 9.29 \n",
"25 benoît bonami - 44u - 43u 9.82 9.40 \n",
"26 suhaim rahal - 49u - 47u 7.94 7.50 \n",
"27 mizuki iwata - 45u - 46u 7.70 7.23 \n",
"28 clinton miller 42u - 51u - 7.79 7.40 \n",
"29 eugene mironov 42u - 49u - 7.81 7.48 \n",
".. ... ... ... ... ... ... \n",
"250 chen yao - 56u - 57u 7.90 7.51 \n",
"251 aksel vestergaard - 42u - 38u 9.62 9.29 \n",
"252 ellen luman - 40u - 39u 9.27 8.77 \n",
"253 albino schiavone 35u - 43u - 7.56 7.15 \n",
"254 jose combs - 39u - 36u 7.89 7.42 \n",
"255 jia li teng 48u - 54u - 7.66 7.32 \n",
"256 ilija horvat 42u - 50u - 7.77 7.38 \n",
"257 mathilde nørgaard - 27u - 28u 8.50 8.10 \n",
"258 csilla herczegh - 43u - 46u 7.71 7.27 \n",
"259 aaliyah rice - 31u - 31u 7.64 7.33 \n",
"260 david beauvais - 26u - 23u 7.87 7.47 \n",
"261 caroline shuler - 50u - 54u 7.63 7.27 \n",
"262 alex crawford 51u - 62u - 7.69 7.30 \n",
"263 rebecca jephcott 53u - 63u - 7.96 7.57 \n",
"264 chukwumoge ogochukwu - 41u - 39u 7.95 7.56 \n",
"265 fearne mcgregor - 27u - 29u 7.83 7.48 \n",
"266 ursula freud 42u - 54u - 7.75 7.46 \n",
"267 leon scholz - 38u - 32u 7.72 7.29 \n",
"268 yasmin araujo - 51u - 54u 7.82 7.36 \n",
"269 hiromu horikawa - 47u - 46u 7.77 7.28 \n",
"270 mika martinsson 34u - 43u - 7.50 7.17 \n",
"271 leo vieira - 30u - 33u 7.74 7.36 \n",
"272 steven roy - 41u - 43u 7.87 7.43 \n",
"273 kate wilkinson 36u - 39u - 7.72 7.20 \n",
"274 naja enoksen 43u - 50u - 7.98 7.59 \n",
"275 albina zetticci 45u - 51u - 7.93 7.73 \n",
"276 john teichelmann - 49u - 49u 7.90 7.58 \n",
"277 mathea lillebø 23u - 36u - 9.04 8.67 \n",
"278 vallie prince 31u - 38u - 7.64 7.28 \n",
"279 samúel guðbrandsson 53u - 56u - 8.00 7.64 \n",
"\n",
" hba1c_change \n",
"0 NaN \n",
"1 0.97 \n",
"2 NaN \n",
"3 0.35 \n",
"4 0.32 \n",
"5 0.38 \n",
"6 0.38 \n",
"7 0.34 \n",
"8 NaN \n",
"9 NaN \n",
"10 NaN \n",
"11 0.38 \n",
"12 NaN \n",
"13 0.95 \n",
"14 NaN \n",
"15 NaN \n",
"16 NaN \n",
"17 0.98 \n",
"18 NaN \n",
"19 NaN \n",
"20 0.92 \n",
"21 0.32 \n",
"22 NaN \n",
"23 NaN \n",
"24 0.39 \n",
"25 0.92 \n",
"26 0.94 \n",
"27 0.97 \n",
"28 0.39 \n",
"29 0.33 \n",
".. ... \n",
"250 0.39 \n",
"251 NaN \n",
"252 0.50 \n",
"253 NaN \n",
"254 NaN \n",
"255 0.34 \n",
"256 0.39 \n",
"257 0.90 \n",
"258 NaN \n",
"259 0.31 \n",
"260 NaN \n",
"261 NaN \n",
"262 0.39 \n",
"263 0.39 \n",
"264 0.39 \n",
"265 0.35 \n",
"266 0.29 \n",
"267 0.93 \n",
"268 0.96 \n",
"269 NaN \n",
"270 0.33 \n",
"271 NaN \n",
"272 0.94 \n",
"273 NaN \n",
"274 NaN \n",
"275 0.20 \n",
"276 NaN \n",
"277 0.37 \n",
"278 0.36 \n",
"279 0.36 \n",
"\n",
"[280 rows x 7 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"treatments"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>adverse_reaction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>berta</td>\n",
" <td>napolitani</td>\n",
" <td>injection site discomfort</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>lena</td>\n",
" <td>baer</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>joseph</td>\n",
" <td>day</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>flavia</td>\n",
" <td>fiorentino</td>\n",
" <td>cough</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>manouck</td>\n",
" <td>wubbels</td>\n",
" <td>throat irritation</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>jasmine</td>\n",
" <td>sykes</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>louise</td>\n",
" <td>johnson</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>albinca</td>\n",
" <td>komavec</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>noe</td>\n",
" <td>aranda</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>sofia</td>\n",
" <td>hermansen</td>\n",
" <td>injection site discomfort</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>tegan</td>\n",
" <td>johnson</td>\n",
" <td>headache</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>abel</td>\n",
" <td>yonatan</td>\n",
" <td>cough</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>abdul-nur</td>\n",
" <td>isa</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>leon</td>\n",
" <td>scholz</td>\n",
" <td>injection site discomfort</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>gabriele</td>\n",
" <td>saenger</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>jia li</td>\n",
" <td>teng</td>\n",
" <td>nausea</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>jakob</td>\n",
" <td>jakobsen</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>christopher</td>\n",
" <td>woodward</td>\n",
" <td>nausea</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>ole</td>\n",
" <td>petersen</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>finley</td>\n",
" <td>chandler</td>\n",
" <td>headache</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>anenechi</td>\n",
" <td>chidi</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>miłosław</td>\n",
" <td>wiśniewski</td>\n",
" <td>injection site discomfort</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>lixue</td>\n",
" <td>hsueh</td>\n",
" <td>injection site discomfort</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>merci</td>\n",
" <td>leroux</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>kang</td>\n",
" <td>mai</td>\n",
" <td>injection site discomfort</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>elliot</td>\n",
" <td>richardson</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>clinton</td>\n",
" <td>miller</td>\n",
" <td>throat irritation</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>idalia</td>\n",
" <td>moore</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>xiuxiu</td>\n",
" <td>chang</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>alex</td>\n",
" <td>crawford</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>monika</td>\n",
" <td>lončar</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>steven</td>\n",
" <td>roy</td>\n",
" <td>headache</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>cecilie</td>\n",
" <td>nilsen</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>krisztina</td>\n",
" <td>magyar</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" given_name surname adverse_reaction\n",
"0 berta napolitani injection site discomfort\n",
"1 lena baer hypoglycemia\n",
"2 joseph day hypoglycemia\n",
"3 flavia fiorentino cough\n",
"4 manouck wubbels throat irritation\n",
"5 jasmine sykes hypoglycemia\n",
"6 louise johnson hypoglycemia\n",
"7 albinca komavec hypoglycemia\n",
"8 noe aranda hypoglycemia\n",
"9 sofia hermansen injection site discomfort\n",
"10 tegan johnson headache\n",
"11 abel yonatan cough\n",
"12 abdul-nur isa hypoglycemia\n",
"13 leon scholz injection site discomfort\n",
"14 gabriele saenger hypoglycemia\n",
"15 jia li teng nausea\n",
"16 jakob jakobsen hypoglycemia\n",
"17 christopher woodward nausea\n",
"18 ole petersen hypoglycemia\n",
"19 finley chandler headache\n",
"20 anenechi chidi hypoglycemia\n",
"21 miłosław wiśniewski injection site discomfort\n",
"22 lixue hsueh injection site discomfort\n",
"23 merci leroux hypoglycemia\n",
"24 kang mai injection site discomfort\n",
"25 elliot richardson hypoglycemia\n",
"26 clinton miller throat irritation\n",
"27 idalia moore hypoglycemia\n",
"28 xiuxiu chang hypoglycemia\n",
"29 alex crawford hypoglycemia\n",
"30 monika lončar hypoglycemia\n",
"31 steven roy headache\n",
"32 cecilie nilsen hypoglycemia\n",
"33 krisztina magyar hypoglycemia"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"adverse_reactions"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 503 entries, 0 to 502\n",
"Data columns (total 14 columns):\n",
"patient_id 503 non-null int64\n",
"assigned_sex 503 non-null object\n",
"given_name 503 non-null object\n",
"surname 503 non-null object\n",
"address 491 non-null object\n",
"city 491 non-null object\n",
"state 491 non-null object\n",
"zip_code 491 non-null float64\n",
"country 491 non-null object\n",
"contact 491 non-null object\n",
"birthdate 503 non-null object\n",
"weight 503 non-null float64\n",
"height 503 non-null int64\n",
"bmi 503 non-null float64\n",
"dtypes: float64(3), int64(2), object(9)\n",
"memory usage: 55.1+ KB\n"
]
}
],
"source": [
"patients.info()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 280 entries, 0 to 279\n",
"Data columns (total 7 columns):\n",
"given_name 280 non-null object\n",
"surname 280 non-null object\n",
"auralin 280 non-null object\n",
"novodra 280 non-null object\n",
"hba1c_start 280 non-null float64\n",
"hba1c_end 280 non-null float64\n",
"hba1c_change 171 non-null float64\n",
"dtypes: float64(3), object(4)\n",
"memory usage: 15.4+ KB\n"
]
}
],
"source": [
"treatments.info()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 34 entries, 0 to 33\n",
"Data columns (total 3 columns):\n",
"given_name 34 non-null object\n",
"surname 34 non-null object\n",
"adverse_reaction 34 non-null object\n",
"dtypes: object(3)\n",
"memory usage: 896.0+ bytes\n"
]
}
],
"source": [
"adverse_reactions.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"14 given_name\n",
"15 surname\n",
"21 given_name\n",
"22 surname\n",
"dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_columns = pd.Series(list(patients) + list(treatments) + list(adverse_reactions))\n",
"all_columns[all_columns.duplicated()]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['patient_id',\n",
" 'assigned_sex',\n",
" 'given_name',\n",
" 'surname',\n",
" 'address',\n",
" 'city',\n",
" 'state',\n",
" 'zip_code',\n",
" 'country',\n",
" 'contact',\n",
" 'birthdate',\n",
" 'weight',\n",
" 'height',\n",
" 'bmi']"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(patients)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>assigned_sex</th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>address</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip_code</th>\n",
" <th>country</th>\n",
" <th>contact</th>\n",
" <th>birthdate</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>209</th>\n",
" <td>210</td>\n",
" <td>female</td>\n",
" <td>Lalita</td>\n",
" <td>Eldarkhanov</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>8/14/1950</td>\n",
" <td>143.4</td>\n",
" <td>62</td>\n",
" <td>26.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219</th>\n",
" <td>220</td>\n",
" <td>male</td>\n",
" <td>Mỹ</td>\n",
" <td>Quynh</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4/9/1978</td>\n",
" <td>237.8</td>\n",
" <td>69</td>\n",
" <td>35.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>230</th>\n",
" <td>231</td>\n",
" <td>female</td>\n",
" <td>Elisabeth</td>\n",
" <td>Knudsen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9/23/1976</td>\n",
" <td>165.9</td>\n",
" <td>63</td>\n",
" <td>29.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234</th>\n",
" <td>235</td>\n",
" <td>female</td>\n",
" <td>Martina</td>\n",
" <td>Tománková</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4/7/1936</td>\n",
" <td>199.5</td>\n",
" <td>65</td>\n",
" <td>33.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>242</th>\n",
" <td>243</td>\n",
" <td>male</td>\n",
" <td>John</td>\n",
" <td>O'Brian</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2/25/1957</td>\n",
" <td>205.3</td>\n",
" <td>74</td>\n",
" <td>26.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>249</th>\n",
" <td>250</td>\n",
" <td>male</td>\n",
" <td>Benjamin</td>\n",
" <td>Mehler</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>10/30/1951</td>\n",
" <td>146.5</td>\n",
" <td>69</td>\n",
" <td>21.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257</th>\n",
" <td>258</td>\n",
" <td>male</td>\n",
" <td>Jin</td>\n",
" <td>Kung</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5/17/1995</td>\n",
" <td>231.7</td>\n",
" <td>69</td>\n",
" <td>34.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>264</th>\n",
" <td>265</td>\n",
" <td>female</td>\n",
" <td>Wafiyyah</td>\n",
" <td>Asfour</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>11/3/1989</td>\n",
" <td>158.6</td>\n",
" <td>63</td>\n",
" <td>28.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>270</td>\n",
" <td>female</td>\n",
" <td>Flavia</td>\n",
" <td>Fiorentino</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>10/9/1937</td>\n",
" <td>175.2</td>\n",
" <td>61</td>\n",
" <td>33.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>278</th>\n",
" <td>279</td>\n",
" <td>female</td>\n",
" <td>Generosa</td>\n",
" <td>Cabán</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>12/16/1962</td>\n",
" <td>124.3</td>\n",
" <td>69</td>\n",
" <td>18.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>286</th>\n",
" <td>287</td>\n",
" <td>male</td>\n",
" <td>Lewis</td>\n",
" <td>Webb</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4/1/1979</td>\n",
" <td>155.3</td>\n",
" <td>68</td>\n",
" <td>23.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>296</th>\n",
" <td>297</td>\n",
" <td>female</td>\n",
" <td>Chỉ</td>\n",
" <td>Lâm</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5/14/1990</td>\n",
" <td>181.1</td>\n",
" <td>63</td>\n",
" <td>32.1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" patient_id assigned_sex given_name surname address city state \\\n",
"209 210 female Lalita Eldarkhanov NaN NaN NaN \n",
"219 220 male Mỹ Quynh NaN NaN NaN \n",
"230 231 female Elisabeth Knudsen NaN NaN NaN \n",
"234 235 female Martina Tománková NaN NaN NaN \n",
"242 243 male John O'Brian NaN NaN NaN \n",
"249 250 male Benjamin Mehler NaN NaN NaN \n",
"257 258 male Jin Kung NaN NaN NaN \n",
"264 265 female Wafiyyah Asfour NaN NaN NaN \n",
"269 270 female Flavia Fiorentino NaN NaN NaN \n",
"278 279 female Generosa Cabán NaN NaN NaN \n",
"286 287 male Lewis Webb NaN NaN NaN \n",
"296 297 female Chỉ Lâm NaN NaN NaN \n",
"\n",
" zip_code country contact birthdate weight height bmi \n",
"209 NaN NaN NaN 8/14/1950 143.4 62 26.2 \n",
"219 NaN NaN NaN 4/9/1978 237.8 69 35.1 \n",
"230 NaN NaN NaN 9/23/1976 165.9 63 29.4 \n",
"234 NaN NaN NaN 4/7/1936 199.5 65 33.2 \n",
"242 NaN NaN NaN 2/25/1957 205.3 74 26.4 \n",
"249 NaN NaN NaN 10/30/1951 146.5 69 21.6 \n",
"257 NaN NaN NaN 5/17/1995 231.7 69 34.2 \n",
"264 NaN NaN NaN 11/3/1989 158.6 63 28.1 \n",
"269 NaN NaN NaN 10/9/1937 175.2 61 33.1 \n",
"278 NaN NaN NaN 12/16/1962 124.3 69 18.4 \n",
"286 NaN NaN NaN 4/1/1979 155.3 68 23.6 \n",
"296 NaN NaN NaN 5/14/1990 181.1 63 32.1 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients[patients['address'].isnull()]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>zip_code</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>503.000000</td>\n",
" <td>491.000000</td>\n",
" <td>503.000000</td>\n",
" <td>503.000000</td>\n",
" <td>503.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>252.000000</td>\n",
" <td>49084.118126</td>\n",
" <td>173.434990</td>\n",
" <td>66.634195</td>\n",
" <td>27.483897</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>145.347859</td>\n",
" <td>30265.807442</td>\n",
" <td>33.916741</td>\n",
" <td>4.411297</td>\n",
" <td>5.276438</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>1002.000000</td>\n",
" <td>48.800000</td>\n",
" <td>27.000000</td>\n",
" <td>17.100000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>126.500000</td>\n",
" <td>21920.500000</td>\n",
" <td>149.300000</td>\n",
" <td>63.000000</td>\n",
" <td>23.300000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>252.000000</td>\n",
" <td>48057.000000</td>\n",
" <td>175.300000</td>\n",
" <td>67.000000</td>\n",
" <td>27.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>377.500000</td>\n",
" <td>75679.000000</td>\n",
" <td>199.500000</td>\n",
" <td>70.000000</td>\n",
" <td>31.750000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>503.000000</td>\n",
" <td>99701.000000</td>\n",
" <td>255.900000</td>\n",
" <td>79.000000</td>\n",
" <td>37.700000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" patient_id zip_code weight height bmi\n",
"count 503.000000 491.000000 503.000000 503.000000 503.000000\n",
"mean 252.000000 49084.118126 173.434990 66.634195 27.483897\n",
"std 145.347859 30265.807442 33.916741 4.411297 5.276438\n",
"min 1.000000 1002.000000 48.800000 27.000000 17.100000\n",
"25% 126.500000 21920.500000 149.300000 63.000000 23.300000\n",
"50% 252.000000 48057.000000 175.300000 67.000000 27.200000\n",
"75% 377.500000 75679.000000 199.500000 70.000000 31.750000\n",
"max 503.000000 99701.000000 255.900000 79.000000 37.700000"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients.describe()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>hba1c_start</th>\n",
" <th>hba1c_end</th>\n",
" <th>hba1c_change</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>280.000000</td>\n",
" <td>280.000000</td>\n",
" <td>171.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>7.985929</td>\n",
" <td>7.589286</td>\n",
" <td>0.546023</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.568638</td>\n",
" <td>0.569672</td>\n",
" <td>0.279555</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>7.500000</td>\n",
" <td>7.010000</td>\n",
" <td>0.200000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>7.660000</td>\n",
" <td>7.270000</td>\n",
" <td>0.340000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>7.800000</td>\n",
" <td>7.420000</td>\n",
" <td>0.380000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>7.970000</td>\n",
" <td>7.570000</td>\n",
" <td>0.920000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>9.950000</td>\n",
" <td>9.580000</td>\n",
" <td>0.990000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" hba1c_start hba1c_end hba1c_change\n",
"count 280.000000 280.000000 171.000000\n",
"mean 7.985929 7.589286 0.546023\n",
"std 0.568638 0.569672 0.279555\n",
"min 7.500000 7.010000 0.200000\n",
"25% 7.660000 7.270000 0.340000\n",
"50% 7.800000 7.420000 0.380000\n",
"75% 7.970000 7.570000 0.920000\n",
"max 9.950000 9.580000 0.990000"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"treatments.describe()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>assigned_sex</th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>address</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip_code</th>\n",
" <th>country</th>\n",
" <th>contact</th>\n",
" <th>birthdate</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>86</th>\n",
" <td>87</td>\n",
" <td>male</td>\n",
" <td>Philemon</td>\n",
" <td>Abdulov</td>\n",
" <td>1341 Chatham Way</td>\n",
" <td>Temple Hills</td>\n",
" <td>MD</td>\n",
" <td>20031.0</td>\n",
" <td>United States</td>\n",
" <td>240-695-5212PhilemonAbdulov@rhyta.com</td>\n",
" <td>7/7/1943</td>\n",
" <td>171.4</td>\n",
" <td>70</td>\n",
" <td>24.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>402</th>\n",
" <td>403</td>\n",
" <td>female</td>\n",
" <td>Manouck</td>\n",
" <td>Wubbels</td>\n",
" <td>4789 Devils Hill Road</td>\n",
" <td>Utica</td>\n",
" <td>MS</td>\n",
" <td>39175.0</td>\n",
" <td>United States</td>\n",
" <td>601-885-6550ManouckWubbels@armyspy.com</td>\n",
" <td>11/3/1964</td>\n",
" <td>201.5</td>\n",
" <td>65</td>\n",
" <td>33.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>364</th>\n",
" <td>365</td>\n",
" <td>female</td>\n",
" <td>Chukwumoge</td>\n",
" <td>Ogochukwu</td>\n",
" <td>4704 Edsel Road</td>\n",
" <td>Sherman Oaks</td>\n",
" <td>CA</td>\n",
" <td>91403.0</td>\n",
" <td>United States</td>\n",
" <td>818-372-7106ChukwumogeOgochukwu@teleworm.us</td>\n",
" <td>9/16/1960</td>\n",
" <td>212.3</td>\n",
" <td>75</td>\n",
" <td>26.5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107</th>\n",
" <td>108</td>\n",
" <td>male</td>\n",
" <td>Rocco</td>\n",
" <td>Christie</td>\n",
" <td>1373 Wilmar Farm Road</td>\n",
" <td>Mount Airy</td>\n",
" <td>MD</td>\n",
" <td>21771.0</td>\n",
" <td>United States</td>\n",
" <td>240-322-1398RoccoChristie@rhyta.com</td>\n",
" <td>1/16/1987</td>\n",
" <td>167.0</td>\n",
" <td>70</td>\n",
" <td>24.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>123</th>\n",
" <td>124</td>\n",
" <td>male</td>\n",
" <td>Tas</td>\n",
" <td>Fejes</td>\n",
" <td>4386 Badger Pond Lane</td>\n",
" <td>Tampa</td>\n",
" <td>FL</td>\n",
" <td>33634.0</td>\n",
" <td>United States</td>\n",
" <td>727-331-8429FejesTas@jourrapide.com</td>\n",
" <td>12/2/1982</td>\n",
" <td>193.2</td>\n",
" <td>66</td>\n",
" <td>31.2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" patient_id assigned_sex given_name surname address \\\n",
"86 87 male Philemon Abdulov 1341 Chatham Way \n",
"402 403 female Manouck Wubbels 4789 Devils Hill Road \n",
"364 365 female Chukwumoge Ogochukwu 4704 Edsel Road \n",
"107 108 male Rocco Christie 1373 Wilmar Farm Road \n",
"123 124 male Tas Fejes 4386 Badger Pond Lane \n",
"\n",
" city state zip_code country \\\n",
"86 Temple Hills MD 20031.0 United States \n",
"402 Utica MS 39175.0 United States \n",
"364 Sherman Oaks CA 91403.0 United States \n",
"107 Mount Airy MD 21771.0 United States \n",
"123 Tampa FL 33634.0 United States \n",
"\n",
" contact birthdate weight height \\\n",
"86 240-695-5212PhilemonAbdulov@rhyta.com 7/7/1943 171.4 70 \n",
"402 601-885-6550ManouckWubbels@armyspy.com 11/3/1964 201.5 65 \n",
"364 818-372-7106ChukwumogeOgochukwu@teleworm.us 9/16/1960 212.3 75 \n",
"107 240-322-1398RoccoChristie@rhyta.com 1/16/1987 167.0 70 \n",
"123 727-331-8429FejesTas@jourrapide.com 12/2/1982 193.2 66 \n",
"\n",
" bmi \n",
"86 24.6 \n",
"402 33.5 \n",
"364 26.5 \n",
"107 24.0 \n",
"123 31.2 "
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Doe 6\n",
"Taylor 3\n",
"Jakobsen 3\n",
"Lâm 2\n",
"Silva 2\n",
"Grímsdóttir 2\n",
"Dratchev 2\n",
"Cindrić 2\n",
"Batukayev 2\n",
"Hueber 2\n",
"Parker 2\n",
"Kadyrov 2\n",
"Ogochukwu 2\n",
"Gersten 2\n",
"Johnson 2\n",
"Kowalczyk 2\n",
"Berg 2\n",
"Schiavone 2\n",
"Woźniak 2\n",
"Lund 2\n",
"Correia 2\n",
"Liễu 2\n",
"Aranda 2\n",
"Collins 2\n",
"Bùi 2\n",
"Souza 2\n",
"Nilsen 2\n",
"Tucker 2\n",
"Lương 2\n",
"Cabrera 2\n",
" ..\n",
"Tuma 1\n",
"Tobeolisa 1\n",
"Novosel 1\n",
"Totth 1\n",
"Kos 1\n",
"McGregor 1\n",
"Madrid 1\n",
"Mancini 1\n",
"Shuler 1\n",
"Klobučar 1\n",
"Galić 1\n",
"Lončar 1\n",
"Moore 1\n",
"Vaneker 1\n",
"Uspenskaya 1\n",
"Hsu 1\n",
"Komavec 1\n",
"Gyenes 1\n",
"Pecinová 1\n",
"Sleiman 1\n",
"Isa 1\n",
"Mortensen 1\n",
"Lê 1\n",
"Afanasyeva 1\n",
"Hill 1\n",
"Vaara 1\n",
"Reilly 1\n",
"Yonatan 1\n",
"Henzen 1\n",
"Musliyevich 1\n",
"Name: surname, Length: 466, dtype: int64"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients.surname.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"123 Main Street 6\n",
"2778 North Avenue 2\n",
"648 Old Dear Lane 2\n",
"2476 Fulton Street 2\n",
"570 Alpha Avenue 1\n",
"3072 Braxton Street 1\n",
"4851 Andy Street 1\n",
"441 Tibbs Avenue 1\n",
"3499 Baker Avenue 1\n",
"192 Patton Lane 1\n",
"649 Nutter Street 1\n",
"2645 Moore Avenue 1\n",
"1079 Ingram Street 1\n",
"1330 Lincoln Street 1\n",
"2931 Romano Street 1\n",
"1731 Chandler Drive 1\n",
"3542 Robinson Court 1\n",
"3008 Walkers Ridge Way 1\n",
"494 Clarksburg Park Road 1\n",
"3113 Timber Ridge Road 1\n",
"475 Preston Street 1\n",
"4649 Worley Avenue 1\n",
"2578 Tenmile 1\n",
"1774 George Avenue 1\n",
"4310 Johnson Street 1\n",
"353 Whaley Lane 1\n",
"4111 Thunder Road 1\n",
"4386 Camden Street 1\n",
"4500 Myra Street 1\n",
"783 Callison Lane 1\n",
" ..\n",
"4839 North Avenue 1\n",
"577 Chipmunk Lane 1\n",
"2356 Myra Street 1\n",
"3141 Brentwood Drive 1\n",
"4220 Simpson Square 1\n",
"3006 Maple Court 1\n",
"1463 Martha Ellen Drive 1\n",
"3781 Hamill Avenue 1\n",
"3402 Kildeer Drive 1\n",
"954 Summit Park Avenue 1\n",
"2775 Single Street 1\n",
"1368 Yorkshire Circle 1\n",
"2270 Bel Meadow Drive 1\n",
"1934 August Lane 1\n",
"1942 Harry Place 1\n",
"4093 Smith Street 1\n",
"2813 Frederick Street 1\n",
"2324 Benson Street 1\n",
"3391 Marcus Street 1\n",
"2970 Forest Avenue 1\n",
"550 Cliffside Drive 1\n",
"1373 Wilmar Farm Road 1\n",
"1815 Garrett Street 1\n",
"3868 Freed Drive 1\n",
"212 Tibbs Avenue 1\n",
"3595 Stuart Street 1\n",
"1526 Tully Street 1\n",
"2146 Willow Greene Drive 1\n",
"4103 Musgrave Street 1\n",
"3314 Rocket Drive 1\n",
"Name: address, Length: 483, dtype: int64"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients.address.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>assigned_sex</th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>address</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip_code</th>\n",
" <th>country</th>\n",
" <th>contact</th>\n",
" <th>birthdate</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>30</td>\n",
" <td>male</td>\n",
" <td>Jake</td>\n",
" <td>Jakobsen</td>\n",
" <td>648 Old Dear Lane</td>\n",
" <td>Port Jervis</td>\n",
" <td>New York</td>\n",
" <td>12771.0</td>\n",
" <td>United States</td>\n",
" <td>JakobCJakobsen@einrot.com+1 (845) 858-7707</td>\n",
" <td>8/1/1985</td>\n",
" <td>155.8</td>\n",
" <td>67</td>\n",
" <td>24.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>219</th>\n",
" <td>220</td>\n",
" <td>male</td>\n",
" <td>Mỹ</td>\n",
" <td>Quynh</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4/9/1978</td>\n",
" <td>237.8</td>\n",
" <td>69</td>\n",
" <td>35.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>229</th>\n",
" <td>230</td>\n",
" <td>male</td>\n",
" <td>John</td>\n",
" <td>Doe</td>\n",
" <td>123 Main Street</td>\n",
" <td>New York</td>\n",
" <td>NY</td>\n",
" <td>12345.0</td>\n",
" <td>United States</td>\n",
" <td>johndoe@email.com1234567890</td>\n",
" <td>1/1/1975</td>\n",
" <td>180.0</td>\n",
" <td>72</td>\n",
" <td>24.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>230</th>\n",
" <td>231</td>\n",
" <td>female</td>\n",
" <td>Elisabeth</td>\n",
" <td>Knudsen</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>9/23/1976</td>\n",
" <td>165.9</td>\n",
" <td>63</td>\n",
" <td>29.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>234</th>\n",
" <td>235</td>\n",
" <td>female</td>\n",
" <td>Martina</td>\n",
" <td>Tománková</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4/7/1936</td>\n",
" <td>199.5</td>\n",
" <td>65</td>\n",
" <td>33.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>237</th>\n",
" <td>238</td>\n",
" <td>male</td>\n",
" <td>John</td>\n",
" <td>Doe</td>\n",
" <td>123 Main Street</td>\n",
" <td>New York</td>\n",
" <td>NY</td>\n",
" <td>12345.0</td>\n",
" <td>United States</td>\n",
" <td>johndoe@email.com1234567890</td>\n",
" <td>1/1/1975</td>\n",
" <td>180.0</td>\n",
" <td>72</td>\n",
" <td>24.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>242</th>\n",
" <td>243</td>\n",
" <td>male</td>\n",
" <td>John</td>\n",
" <td>O'Brian</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2/25/1957</td>\n",
" <td>205.3</td>\n",
" <td>74</td>\n",
" <td>26.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>244</th>\n",
" <td>245</td>\n",
" <td>male</td>\n",
" <td>John</td>\n",
" <td>Doe</td>\n",
" <td>123 Main Street</td>\n",
" <td>New York</td>\n",
" <td>NY</td>\n",
" <td>12345.0</td>\n",
" <td>United States</td>\n",
" <td>johndoe@email.com1234567890</td>\n",
" <td>1/1/1975</td>\n",
" <td>180.0</td>\n",
" <td>72</td>\n",
" <td>24.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>249</th>\n",
" <td>250</td>\n",
" <td>male</td>\n",
" <td>Benjamin</td>\n",
" <td>Mehler</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>10/30/1951</td>\n",
" <td>146.5</td>\n",
" <td>69</td>\n",
" <td>21.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>251</th>\n",
" <td>252</td>\n",
" <td>male</td>\n",
" <td>John</td>\n",
" <td>Doe</td>\n",
" <td>123 Main Street</td>\n",
" <td>New York</td>\n",
" <td>NY</td>\n",
" <td>12345.0</td>\n",
" <td>United States</td>\n",
" <td>johndoe@email.com1234567890</td>\n",
" <td>1/1/1975</td>\n",
" <td>180.0</td>\n",
" <td>72</td>\n",
" <td>24.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>257</th>\n",
" <td>258</td>\n",
" <td>male</td>\n",
" <td>Jin</td>\n",
" <td>Kung</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5/17/1995</td>\n",
" <td>231.7</td>\n",
" <td>69</td>\n",
" <td>34.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>264</th>\n",
" <td>265</td>\n",
" <td>female</td>\n",
" <td>Wafiyyah</td>\n",
" <td>Asfour</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>11/3/1989</td>\n",
" <td>158.6</td>\n",
" <td>63</td>\n",
" <td>28.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>269</th>\n",
" <td>270</td>\n",
" <td>female</td>\n",
" <td>Flavia</td>\n",
" <td>Fiorentino</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>10/9/1937</td>\n",
" <td>175.2</td>\n",
" <td>61</td>\n",
" <td>33.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>277</th>\n",
" <td>278</td>\n",
" <td>male</td>\n",
" <td>John</td>\n",
" <td>Doe</td>\n",
" <td>123 Main Street</td>\n",
" <td>New York</td>\n",
" <td>NY</td>\n",
" <td>12345.0</td>\n",
" <td>United States</td>\n",
" <td>johndoe@email.com1234567890</td>\n",
" <td>1/1/1975</td>\n",
" <td>180.0</td>\n",
" <td>72</td>\n",
" <td>24.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>278</th>\n",
" <td>279</td>\n",
" <td>female</td>\n",
" <td>Generosa</td>\n",
" <td>Cabán</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>12/16/1962</td>\n",
" <td>124.3</td>\n",
" <td>69</td>\n",
" <td>18.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>282</th>\n",
" <td>283</td>\n",
" <td>female</td>\n",
" <td>Sandy</td>\n",
" <td>Taylor</td>\n",
" <td>2476 Fulton Street</td>\n",
" <td>Rainelle</td>\n",
" <td>WV</td>\n",
" <td>25962.0</td>\n",
" <td>United States</td>\n",
" <td>304-438-2648SandraCTaylor@dayrep.com</td>\n",
" <td>10/23/1960</td>\n",
" <td>206.1</td>\n",
" <td>64</td>\n",
" <td>35.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>286</th>\n",
" <td>287</td>\n",
" <td>male</td>\n",
" <td>Lewis</td>\n",
" <td>Webb</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4/1/1979</td>\n",
" <td>155.3</td>\n",
" <td>68</td>\n",
" <td>23.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>296</th>\n",
" <td>297</td>\n",
" <td>female</td>\n",
" <td>Chỉ</td>\n",
" <td>Lâm</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5/14/1990</td>\n",
" <td>181.1</td>\n",
" <td>63</td>\n",
" <td>32.1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>502</th>\n",
" <td>503</td>\n",
" <td>male</td>\n",
" <td>Pat</td>\n",
" <td>Gersten</td>\n",
" <td>2778 North Avenue</td>\n",
" <td>Burr</td>\n",
" <td>Nebraska</td>\n",
" <td>68324.0</td>\n",
" <td>United States</td>\n",
" <td>PatrickGersten@rhyta.com402-848-4923</td>\n",
" <td>5/3/1954</td>\n",
" <td>138.2</td>\n",
" <td>71</td>\n",
" <td>19.3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" patient_id assigned_sex given_name surname address \\\n",
"29 30 male Jake Jakobsen 648 Old Dear Lane \n",
"219 220 male Mỹ Quynh NaN \n",
"229 230 male John Doe 123 Main Street \n",
"230 231 female Elisabeth Knudsen NaN \n",
"234 235 female Martina Tománková NaN \n",
"237 238 male John Doe 123 Main Street \n",
"242 243 male John O'Brian NaN \n",
"244 245 male John Doe 123 Main Street \n",
"249 250 male Benjamin Mehler NaN \n",
"251 252 male John Doe 123 Main Street \n",
"257 258 male Jin Kung NaN \n",
"264 265 female Wafiyyah Asfour NaN \n",
"269 270 female Flavia Fiorentino NaN \n",
"277 278 male John Doe 123 Main Street \n",
"278 279 female Generosa Cabán NaN \n",
"282 283 female Sandy Taylor 2476 Fulton Street \n",
"286 287 male Lewis Webb NaN \n",
"296 297 female Chỉ Lâm NaN \n",
"502 503 male Pat Gersten 2778 North Avenue \n",
"\n",
" city state zip_code country \\\n",
"29 Port Jervis New York 12771.0 United States \n",
"219 NaN NaN NaN NaN \n",
"229 New York NY 12345.0 United States \n",
"230 NaN NaN NaN NaN \n",
"234 NaN NaN NaN NaN \n",
"237 New York NY 12345.0 United States \n",
"242 NaN NaN NaN NaN \n",
"244 New York NY 12345.0 United States \n",
"249 NaN NaN NaN NaN \n",
"251 New York NY 12345.0 United States \n",
"257 NaN NaN NaN NaN \n",
"264 NaN NaN NaN NaN \n",
"269 NaN NaN NaN NaN \n",
"277 New York NY 12345.0 United States \n",
"278 NaN NaN NaN NaN \n",
"282 Rainelle WV 25962.0 United States \n",
"286 NaN NaN NaN NaN \n",
"296 NaN NaN NaN NaN \n",
"502 Burr Nebraska 68324.0 United States \n",
"\n",
" contact birthdate weight height \\\n",
"29 JakobCJakobsen@einrot.com+1 (845) 858-7707 8/1/1985 155.8 67 \n",
"219 NaN 4/9/1978 237.8 69 \n",
"229 johndoe@email.com1234567890 1/1/1975 180.0 72 \n",
"230 NaN 9/23/1976 165.9 63 \n",
"234 NaN 4/7/1936 199.5 65 \n",
"237 johndoe@email.com1234567890 1/1/1975 180.0 72 \n",
"242 NaN 2/25/1957 205.3 74 \n",
"244 johndoe@email.com1234567890 1/1/1975 180.0 72 \n",
"249 NaN 10/30/1951 146.5 69 \n",
"251 johndoe@email.com1234567890 1/1/1975 180.0 72 \n",
"257 NaN 5/17/1995 231.7 69 \n",
"264 NaN 11/3/1989 158.6 63 \n",
"269 NaN 10/9/1937 175.2 61 \n",
"277 johndoe@email.com1234567890 1/1/1975 180.0 72 \n",
"278 NaN 12/16/1962 124.3 69 \n",
"282 304-438-2648SandraCTaylor@dayrep.com 10/23/1960 206.1 64 \n",
"286 NaN 4/1/1979 155.3 68 \n",
"296 NaN 5/14/1990 181.1 63 \n",
"502 PatrickGersten@rhyta.com402-848-4923 5/3/1954 138.2 71 \n",
"\n",
" bmi \n",
"29 24.4 \n",
"219 35.1 \n",
"229 24.4 \n",
"230 29.4 \n",
"234 33.2 \n",
"237 24.4 \n",
"242 26.4 \n",
"244 24.4 \n",
"249 21.6 \n",
"251 24.4 \n",
"257 34.2 \n",
"264 28.1 \n",
"269 33.1 \n",
"277 24.4 \n",
"278 18.4 \n",
"282 35.4 \n",
"286 23.6 \n",
"296 32.1 \n",
"502 19.3 "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients[patients.address.duplicated()]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"210 48.8\n",
"459 102.1\n",
"335 102.7\n",
"74 103.2\n",
"317 106.0\n",
"171 106.5\n",
"51 107.1\n",
"270 108.1\n",
"198 108.5\n",
"48 109.1\n",
"478 109.6\n",
"141 110.2\n",
"38 111.8\n",
"438 112.0\n",
"14 112.0\n",
"235 112.2\n",
"307 112.4\n",
"191 112.6\n",
"408 113.1\n",
"49 113.3\n",
"326 114.0\n",
"338 114.1\n",
"253 117.0\n",
"321 118.4\n",
"168 118.8\n",
"1 118.8\n",
"350 119.0\n",
"207 119.2\n",
"265 120.0\n",
"341 120.3\n",
" ... \n",
"332 224.0\n",
"252 224.2\n",
"12 224.2\n",
"222 224.8\n",
"166 225.3\n",
"111 225.9\n",
"101 226.2\n",
"150 226.6\n",
"352 227.7\n",
"428 227.7\n",
"88 227.7\n",
"13 228.4\n",
"339 229.0\n",
"182 230.3\n",
"121 230.8\n",
"257 231.7\n",
"395 231.9\n",
"246 232.1\n",
"219 237.8\n",
"11 238.7\n",
"50 238.9\n",
"441 239.1\n",
"499 239.6\n",
"439 242.0\n",
"487 242.4\n",
"144 244.9\n",
"61 244.9\n",
"283 245.5\n",
"118 254.5\n",
"485 255.9\n",
"Name: weight, Length: 503, dtype: float64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients.weight.sort_values()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"210 19.055827\n",
"dtype: float64"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"weight_lbs = patients[patients.surname == 'Zaitseva'].weight * 2.20462\n",
"height_in = patients[patients.surname == 'Zaitseva'].height\n",
"bmi_check = 703 * weight_lbs / (height_in * height_in)\n",
"bmi_check"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"210 19.1\n",
"Name: bmi, dtype: float64"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients[patients.surname == 'Zaitseva'].bmi"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(treatments.auralin.isnull())"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum(treatments.novodra.isnull())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Quality\n",
"##### `patients` table\n",
"- Zip code is a float not a string\n",
"- Zip code has four digits sometimes\n",
"- Tim Neudorf height is 27 in instead of 72 in\n",
"- Full state names sometimes, abbreviations other times\n",
"- Dsvid Gustafsson\n",
"- Missing demographic information (address - contact columns) ***(can't clean yet)***\n",
"- Erroneous datatypes (assigned sex, state, zip_code, and birthdate columns)\n",
"- Multiple phone number formats\n",
"- Default John Doe data\n",
"- Multiple records for Jakobsen, Gersten, Taylor\n",
"- kgs instead of lbs for Zaitseva weight\n",
"\n",
"##### `treatments` table\n",
"- Missing HbA1c changes\n",
"- The letter 'u' in starting and ending doses for Auralin and Novodra\n",
"- Lowercase given names and surnames\n",
"- Missing records (280 instead of 350)\n",
"- Erroneous datatypes (auralin and novodra columns)\n",
"- Inaccurate HbA1c changes (leading 4s mistaken as 9s)\n",
"- Nulls represented as dashes (-) in auralin and novodra columns\n",
"\n",
"##### `adverse_reactions` table\n",
"- Lowercase given names and surnames"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Tidiness\n",
"- Contact column in `patients` table should be split into phone number and email\n",
"- Three variables in two columns in `treatments` table (treatment, start dose and end dose)\n",
"- Adverse reaction should be part of the `treatments` table\n",
"- Given name and surname columns in `patients` table duplicated in `treatments` and `adverse_reactions` tables"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clean"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"patients_clean = patients.copy()\n",
"treatments_clean = treatments.copy()\n",
"adverse_reactions_clean = adverse_reactions.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Missing Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### `treatments`: Missing records (280 instead of 350)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Import the cut treatments into a DataFrame and concatenate it with the original treatments DataFrame."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"treatments_cut = pd.read_csv('treatments_cut.csv')\n",
"treatments_clean = pd.concat([treatments_clean, treatments_cut],\n",
" ignore_index=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>auralin</th>\n",
" <th>novodra</th>\n",
" <th>hba1c_start</th>\n",
" <th>hba1c_end</th>\n",
" <th>hba1c_change</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>veronika</td>\n",
" <td>jindrová</td>\n",
" <td>41u - 48u</td>\n",
" <td>-</td>\n",
" <td>7.63</td>\n",
" <td>7.20</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>elliot</td>\n",
" <td>richardson</td>\n",
" <td>-</td>\n",
" <td>40u - 45u</td>\n",
" <td>7.56</td>\n",
" <td>7.09</td>\n",
" <td>0.97</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>yukitaka</td>\n",
" <td>takenaka</td>\n",
" <td>-</td>\n",
" <td>39u - 36u</td>\n",
" <td>7.68</td>\n",
" <td>7.25</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>skye</td>\n",
" <td>gormanston</td>\n",
" <td>33u - 36u</td>\n",
" <td>-</td>\n",
" <td>7.97</td>\n",
" <td>7.62</td>\n",
" <td>0.35</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>alissa</td>\n",
" <td>montez</td>\n",
" <td>-</td>\n",
" <td>33u - 29u</td>\n",
" <td>7.78</td>\n",
" <td>7.46</td>\n",
" <td>0.32</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" given_name surname auralin novodra hba1c_start hba1c_end \\\n",
"0 veronika jindrová 41u - 48u - 7.63 7.20 \n",
"1 elliot richardson - 40u - 45u 7.56 7.09 \n",
"2 yukitaka takenaka - 39u - 36u 7.68 7.25 \n",
"3 skye gormanston 33u - 36u - 7.97 7.62 \n",
"4 alissa montez - 33u - 29u 7.78 7.46 \n",
"\n",
" hba1c_change \n",
"0 NaN \n",
"1 0.97 \n",
"2 NaN \n",
"3 0.35 \n",
"4 0.32 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"treatments_clean.head()"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>auralin</th>\n",
" <th>novodra</th>\n",
" <th>hba1c_start</th>\n",
" <th>hba1c_end</th>\n",
" <th>hba1c_change</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>345</th>\n",
" <td>rovzan</td>\n",
" <td>kishiev</td>\n",
" <td>32u - 37u</td>\n",
" <td>-</td>\n",
" <td>7.75</td>\n",
" <td>7.41</td>\n",
" <td>0.34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>346</th>\n",
" <td>jakob</td>\n",
" <td>jakobsen</td>\n",
" <td>-</td>\n",
" <td>28u - 26u</td>\n",
" <td>7.96</td>\n",
" <td>7.51</td>\n",
" <td>0.95</td>\n",
" </tr>\n",
" <tr>\n",
" <th>347</th>\n",
" <td>bernd</td>\n",
" <td>schneider</td>\n",
" <td>48u - 56u</td>\n",
" <td>-</td>\n",
" <td>7.74</td>\n",
" <td>7.44</td>\n",
" <td>0.30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>348</th>\n",
" <td>berta</td>\n",
" <td>napolitani</td>\n",
" <td>-</td>\n",
" <td>42u - 44u</td>\n",
" <td>7.68</td>\n",
" <td>7.21</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>349</th>\n",
" <td>armina</td>\n",
" <td>sauvé</td>\n",
" <td>36u - 46u</td>\n",
" <td>-</td>\n",
" <td>7.86</td>\n",
" <td>7.40</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" given_name surname auralin novodra hba1c_start hba1c_end \\\n",
"345 rovzan kishiev 32u - 37u - 7.75 7.41 \n",
"346 jakob jakobsen - 28u - 26u 7.96 7.51 \n",
"347 bernd schneider 48u - 56u - 7.74 7.44 \n",
"348 berta napolitani - 42u - 44u 7.68 7.21 \n",
"349 armina sauvé 36u - 46u - 7.86 7.40 \n",
"\n",
" hba1c_change \n",
"345 0.34 \n",
"346 0.95 \n",
"347 0.30 \n",
"348 NaN \n",
"349 NaN "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"treatments_clean.tail()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### `treatments`: Missing HbA1c changes and Inaccurate HbA1c changes (leading 4s mistaken as 9s)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Recalculate the `hba1c_change` column: `hba1c_start` minus `hba1c_end`. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"treatments_clean.hba1c_change = (treatments_clean.hba1c_start - \n",
" treatments_clean.hba1c_end)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.43\n",
"1 0.47\n",
"2 0.43\n",
"3 0.35\n",
"4 0.32\n",
"Name: hba1c_change, dtype: float64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"treatments_clean.hba1c_change.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Tidiness"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Contact column in `patients` table contains two variables: phone number and email"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Extract the *phone number* and *email* variables from the *contact* column using regular expressions and pandas' `str.extract` method. Drop the *contact* column when done."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"patients_clean['phone_number'] = patients_clean.contact.str.extract('((?:\\+\\d{1,2}\\s)?\\(?\\d{3}\\)?[\\s.-]?\\d{3}[\\s.-]?\\d{4})', expand=True)\n",
"patients_clean['email'] = patients_clean.contact.str.extract('([a-zA-Z][a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\\.[a-zA-Z0-9-.][a-zA-Z]+)', expand=True)\n",
"# Note: axis=1 denotes that we are referring to a column, not a row\n",
"patients_clean = patients_clean.drop('contact', axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['patient_id',\n",
" 'assigned_sex',\n",
" 'given_name',\n",
" 'surname',\n",
" 'address',\n",
" 'city',\n",
" 'state',\n",
" 'zip_code',\n",
" 'country',\n",
" 'birthdate',\n",
" 'weight',\n",
" 'height',\n",
" 'bmi',\n",
" 'phone_number',\n",
" 'email']"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Confirm contact column is gone\n",
"list(patients_clean)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"420 631-479-8171\n",
"278 NaN\n",
"129 631-370-7406\n",
"201 603 773 2333\n",
"443 352-362-5392\n",
"479 909 458 2515\n",
"482 361-693-4960\n",
"461 646-289-4177\n",
"58 412-640-7035\n",
"501 360 443 2060\n",
"6 775-533-5933\n",
"291 302-698-2057\n",
"428 412-319-0903\n",
"183 909-355-9418\n",
"38 978 460 9060\n",
"322 254-546-2728\n",
"307 228-237-2271\n",
"151 916-224-7868\n",
"26 309-671-8852\n",
"161 406-759-6160\n",
"160 602-993-7880\n",
"283 +1 (708) 845-2053\n",
"92 609-914-8473\n",
"446 601-418-0102\n",
"419 317-956-6166\n",
"Name: phone_number, dtype: object"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients_clean.phone_number.sample(25)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"404 AaliyahRice@dayrep.com\n",
"11 Abdul-NurMummarIsa@rhyta.com\n",
"332 AbelEfrem@fleckens.hu\n",
"258 AbelYonatan@teleworm.us\n",
"305 AddolorataLombardi@jourrapide.com\n",
"Name: email, dtype: object"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Confirm that no emails start with an integer (regex didn't match for this)\n",
"patients_clean.email.sort_values().head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Three variables in two columns in `treatments` table (treatment, start dose and end dose)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Melt the *auralin* and *novodra* columns to a *treatment* and a *dose* column (dose will still contain both start and end dose at this point). Then split the dose column on ' - ' to obtain *start_dose* and *end_dose* columns. Drop the intermediate *dose* column."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"treatments_clean = pd.melt(treatments_clean, id_vars=['given_name', 'surname', 'hba1c_start', 'hba1c_end', 'hba1c_change'],\n",
" var_name='treatment', value_name='dose')\n",
"treatments_clean = treatments_clean[treatments_clean.dose != \"-\"]\n",
"treatments_clean['dose_start'], treatments_clean['dose_end'] = treatments_clean['dose'].str.split(' - ', 1).str\n",
"treatments_clean = treatments_clean.drop('dose', axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>hba1c_start</th>\n",
" <th>hba1c_end</th>\n",
" <th>hba1c_change</th>\n",
" <th>treatment</th>\n",
" <th>dose_start</th>\n",
" <th>dose_end</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>veronika</td>\n",
" <td>jindrová</td>\n",
" <td>7.63</td>\n",
" <td>7.20</td>\n",
" <td>0.43</td>\n",
" <td>auralin</td>\n",
" <td>41u</td>\n",
" <td>48u</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>skye</td>\n",
" <td>gormanston</td>\n",
" <td>7.97</td>\n",
" <td>7.62</td>\n",
" <td>0.35</td>\n",
" <td>auralin</td>\n",
" <td>33u</td>\n",
" <td>36u</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>sophia</td>\n",
" <td>haugen</td>\n",
" <td>7.65</td>\n",
" <td>7.27</td>\n",
" <td>0.38</td>\n",
" <td>auralin</td>\n",
" <td>37u</td>\n",
" <td>42u</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>eddie</td>\n",
" <td>archer</td>\n",
" <td>7.89</td>\n",
" <td>7.55</td>\n",
" <td>0.34</td>\n",
" <td>auralin</td>\n",
" <td>31u</td>\n",
" <td>38u</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>asia</td>\n",
" <td>woźniak</td>\n",
" <td>7.76</td>\n",
" <td>7.37</td>\n",
" <td>0.39</td>\n",
" <td>auralin</td>\n",
" <td>30u</td>\n",
" <td>36u</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" given_name surname hba1c_start hba1c_end hba1c_change treatment \\\n",
"0 veronika jindrová 7.63 7.20 0.43 auralin \n",
"3 skye gormanston 7.97 7.62 0.35 auralin \n",
"6 sophia haugen 7.65 7.27 0.38 auralin \n",
"7 eddie archer 7.89 7.55 0.34 auralin \n",
"9 asia woźniak 7.76 7.37 0.39 auralin \n",
"\n",
" dose_start dose_end \n",
"0 41u 48u \n",
"3 33u 36u \n",
"6 37u 42u \n",
"7 31u 38u \n",
"9 30u 36u "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"treatments_clean.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Adverse reaction should be part of the `treatments` table"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Merge the *adverse_reaction* column to the `treatments` table, joining on *given name* and *surname*."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"treatments_clean = pd.merge(treatments_clean, adverse_reactions_clean,\n",
" on=['given_name', 'surname'], how='left')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>hba1c_start</th>\n",
" <th>hba1c_end</th>\n",
" <th>hba1c_change</th>\n",
" <th>treatment</th>\n",
" <th>dose_start</th>\n",
" <th>dose_end</th>\n",
" <th>adverse_reaction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>veronika</td>\n",
" <td>jindrová</td>\n",
" <td>7.63</td>\n",
" <td>7.20</td>\n",
" <td>0.43</td>\n",
" <td>auralin</td>\n",
" <td>41u</td>\n",
" <td>48u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>skye</td>\n",
" <td>gormanston</td>\n",
" <td>7.97</td>\n",
" <td>7.62</td>\n",
" <td>0.35</td>\n",
" <td>auralin</td>\n",
" <td>33u</td>\n",
" <td>36u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>sophia</td>\n",
" <td>haugen</td>\n",
" <td>7.65</td>\n",
" <td>7.27</td>\n",
" <td>0.38</td>\n",
" <td>auralin</td>\n",
" <td>37u</td>\n",
" <td>42u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>eddie</td>\n",
" <td>archer</td>\n",
" <td>7.89</td>\n",
" <td>7.55</td>\n",
" <td>0.34</td>\n",
" <td>auralin</td>\n",
" <td>31u</td>\n",
" <td>38u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>asia</td>\n",
" <td>woźniak</td>\n",
" <td>7.76</td>\n",
" <td>7.37</td>\n",
" <td>0.39</td>\n",
" <td>auralin</td>\n",
" <td>30u</td>\n",
" <td>36u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>joseph</td>\n",
" <td>day</td>\n",
" <td>7.70</td>\n",
" <td>7.19</td>\n",
" <td>0.51</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>36u</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>roxanne</td>\n",
" <td>andreyeva</td>\n",
" <td>9.54</td>\n",
" <td>9.14</td>\n",
" <td>0.40</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>38u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>simone</td>\n",
" <td>baumgaertner</td>\n",
" <td>7.74</td>\n",
" <td>7.30</td>\n",
" <td>0.44</td>\n",
" <td>auralin</td>\n",
" <td>27u</td>\n",
" <td>37u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>enco</td>\n",
" <td>žibrik</td>\n",
" <td>7.78</td>\n",
" <td>7.34</td>\n",
" <td>0.44</td>\n",
" <td>auralin</td>\n",
" <td>55u</td>\n",
" <td>68u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>camilla</td>\n",
" <td>zaitseva</td>\n",
" <td>7.53</td>\n",
" <td>7.13</td>\n",
" <td>0.40</td>\n",
" <td>auralin</td>\n",
" <td>28u</td>\n",
" <td>37u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>tekla</td>\n",
" <td>walczak</td>\n",
" <td>7.61</td>\n",
" <td>7.29</td>\n",
" <td>0.32</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>39u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>brancaleone</td>\n",
" <td>russo</td>\n",
" <td>8.61</td>\n",
" <td>8.18</td>\n",
" <td>0.43</td>\n",
" <td>auralin</td>\n",
" <td>53u</td>\n",
" <td>60u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>isac</td>\n",
" <td>berg</td>\n",
" <td>9.68</td>\n",
" <td>9.29</td>\n",
" <td>0.39</td>\n",
" <td>auralin</td>\n",
" <td>31u</td>\n",
" <td>41u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>clinton</td>\n",
" <td>miller</td>\n",
" <td>7.79</td>\n",
" <td>7.40</td>\n",
" <td>0.39</td>\n",
" <td>auralin</td>\n",
" <td>42u</td>\n",
" <td>51u</td>\n",
" <td>throat irritation</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>eugene</td>\n",
" <td>mironov</td>\n",
" <td>7.81</td>\n",
" <td>7.48</td>\n",
" <td>0.33</td>\n",
" <td>auralin</td>\n",
" <td>42u</td>\n",
" <td>49u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>szilveszter</td>\n",
" <td>totth</td>\n",
" <td>7.70</td>\n",
" <td>7.38</td>\n",
" <td>0.32</td>\n",
" <td>auralin</td>\n",
" <td>35u</td>\n",
" <td>39u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>alexander</td>\n",
" <td>mathiesen</td>\n",
" <td>7.96</td>\n",
" <td>7.55</td>\n",
" <td>0.41</td>\n",
" <td>auralin</td>\n",
" <td>47u</td>\n",
" <td>58u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>chỉ</td>\n",
" <td>lâm</td>\n",
" <td>7.68</td>\n",
" <td>7.24</td>\n",
" <td>0.44</td>\n",
" <td>auralin</td>\n",
" <td>45u</td>\n",
" <td>48u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>władysław</td>\n",
" <td>wieczorek</td>\n",
" <td>7.92</td>\n",
" <td>7.47</td>\n",
" <td>0.45</td>\n",
" <td>auralin</td>\n",
" <td>24u</td>\n",
" <td>37u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>kristján</td>\n",
" <td>ingason</td>\n",
" <td>7.92</td>\n",
" <td>7.57</td>\n",
" <td>0.35</td>\n",
" <td>auralin</td>\n",
" <td>44u</td>\n",
" <td>55u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>marija</td>\n",
" <td>grubišić</td>\n",
" <td>7.53</td>\n",
" <td>7.15</td>\n",
" <td>0.38</td>\n",
" <td>auralin</td>\n",
" <td>37u</td>\n",
" <td>43u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>sauli</td>\n",
" <td>koivuniemi</td>\n",
" <td>7.67</td>\n",
" <td>7.37</td>\n",
" <td>0.30</td>\n",
" <td>auralin</td>\n",
" <td>43u</td>\n",
" <td>47u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>mariana</td>\n",
" <td>souza</td>\n",
" <td>7.86</td>\n",
" <td>7.51</td>\n",
" <td>0.35</td>\n",
" <td>auralin</td>\n",
" <td>36u</td>\n",
" <td>42u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>kristoffer</td>\n",
" <td>martinsen</td>\n",
" <td>9.18</td>\n",
" <td>8.64</td>\n",
" <td>0.54</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>37u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>mỹ</td>\n",
" <td>quynh</td>\n",
" <td>7.61</td>\n",
" <td>7.16</td>\n",
" <td>0.45</td>\n",
" <td>auralin</td>\n",
" <td>57u</td>\n",
" <td>64u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>oles</td>\n",
" <td>zhdanov</td>\n",
" <td>7.52</td>\n",
" <td>7.11</td>\n",
" <td>0.41</td>\n",
" <td>auralin</td>\n",
" <td>54u</td>\n",
" <td>67u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>triana.</td>\n",
" <td>terrazas</td>\n",
" <td>7.71</td>\n",
" <td>7.34</td>\n",
" <td>0.37</td>\n",
" <td>auralin</td>\n",
" <td>34u</td>\n",
" <td>42u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>gabryŝ</td>\n",
" <td>tomaszewski</td>\n",
" <td>7.87</td>\n",
" <td>7.47</td>\n",
" <td>0.40</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>37u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>leixandre</td>\n",
" <td>alanis</td>\n",
" <td>7.74</td>\n",
" <td>7.32</td>\n",
" <td>0.42</td>\n",
" <td>auralin</td>\n",
" <td>61u</td>\n",
" <td>67u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>onyekachukwu</td>\n",
" <td>obinna</td>\n",
" <td>7.58</td>\n",
" <td>7.12</td>\n",
" <td>0.46</td>\n",
" <td>auralin</td>\n",
" <td>37u</td>\n",
" <td>46u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>320</th>\n",
" <td>jane</td>\n",
" <td>citizen</td>\n",
" <td>7.98</td>\n",
" <td>7.60</td>\n",
" <td>0.38</td>\n",
" <td>novodra</td>\n",
" <td>37u</td>\n",
" <td>38u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>321</th>\n",
" <td>angela</td>\n",
" <td>lavrentyev</td>\n",
" <td>7.61</td>\n",
" <td>7.14</td>\n",
" <td>0.47</td>\n",
" <td>novodra</td>\n",
" <td>28u</td>\n",
" <td>24u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>322</th>\n",
" <td>edelma</td>\n",
" <td>villalpando</td>\n",
" <td>7.99</td>\n",
" <td>7.56</td>\n",
" <td>0.43</td>\n",
" <td>novodra</td>\n",
" <td>24u</td>\n",
" <td>26u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>323</th>\n",
" <td>annika</td>\n",
" <td>vaara</td>\n",
" <td>7.73</td>\n",
" <td>7.34</td>\n",
" <td>0.39</td>\n",
" <td>novodra</td>\n",
" <td>20u</td>\n",
" <td>21u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>324</th>\n",
" <td>chiho</td>\n",
" <td>higa</td>\n",
" <td>7.71</td>\n",
" <td>7.30</td>\n",
" <td>0.41</td>\n",
" <td>novodra</td>\n",
" <td>46u</td>\n",
" <td>46u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>325</th>\n",
" <td>beatrycze</td>\n",
" <td>woźniak</td>\n",
" <td>7.54</td>\n",
" <td>7.17</td>\n",
" <td>0.37</td>\n",
" <td>novodra</td>\n",
" <td>26u</td>\n",
" <td>27u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>326</th>\n",
" <td>miłosław</td>\n",
" <td>wiśniewski</td>\n",
" <td>7.51</td>\n",
" <td>7.08</td>\n",
" <td>0.43</td>\n",
" <td>novodra</td>\n",
" <td>34u</td>\n",
" <td>33u</td>\n",
" <td>injection site discomfort</td>\n",
" </tr>\n",
" <tr>\n",
" <th>327</th>\n",
" <td>firenze</td>\n",
" <td>fodor</td>\n",
" <td>7.89</td>\n",
" <td>7.55</td>\n",
" <td>0.34</td>\n",
" <td>novodra</td>\n",
" <td>30u</td>\n",
" <td>35u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>328</th>\n",
" <td>zoe</td>\n",
" <td>wellish</td>\n",
" <td>7.71</td>\n",
" <td>7.30</td>\n",
" <td>0.41</td>\n",
" <td>novodra</td>\n",
" <td>33u</td>\n",
" <td>33u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>329</th>\n",
" <td>una</td>\n",
" <td>traustadóttir</td>\n",
" <td>8.00</td>\n",
" <td>7.50</td>\n",
" <td>0.50</td>\n",
" <td>novodra</td>\n",
" <td>35u</td>\n",
" <td>34u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>330</th>\n",
" <td>luboš</td>\n",
" <td>pecha</td>\n",
" <td>7.79</td>\n",
" <td>7.45</td>\n",
" <td>0.34</td>\n",
" <td>novodra</td>\n",
" <td>30u</td>\n",
" <td>27u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>331</th>\n",
" <td>meaza</td>\n",
" <td>brhane</td>\n",
" <td>7.70</td>\n",
" <td>7.36</td>\n",
" <td>0.34</td>\n",
" <td>novodra</td>\n",
" <td>37u</td>\n",
" <td>41u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>332</th>\n",
" <td>adlan</td>\n",
" <td>shishani</td>\n",
" <td>7.84</td>\n",
" <td>7.37</td>\n",
" <td>0.47</td>\n",
" <td>novodra</td>\n",
" <td>43u</td>\n",
" <td>40u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>333</th>\n",
" <td>sofia</td>\n",
" <td>hermansen</td>\n",
" <td>8.90</td>\n",
" <td>8.57</td>\n",
" <td>0.33</td>\n",
" <td>novodra</td>\n",
" <td>34u</td>\n",
" <td>34u</td>\n",
" <td>injection site discomfort</td>\n",
" </tr>\n",
" <tr>\n",
" <th>334</th>\n",
" <td>guðni</td>\n",
" <td>heimisson</td>\n",
" <td>7.64</td>\n",
" <td>7.24</td>\n",
" <td>0.40</td>\n",
" <td>novodra</td>\n",
" <td>40u</td>\n",
" <td>36u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>335</th>\n",
" <td>eufemio</td>\n",
" <td>rosario</td>\n",
" <td>7.54</td>\n",
" <td>7.26</td>\n",
" <td>0.28</td>\n",
" <td>novodra</td>\n",
" <td>37u</td>\n",
" <td>40u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>336</th>\n",
" <td>dalmacia</td>\n",
" <td>madrid</td>\n",
" <td>7.67</td>\n",
" <td>7.21</td>\n",
" <td>0.46</td>\n",
" <td>novodra</td>\n",
" <td>26u</td>\n",
" <td>23u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>337</th>\n",
" <td>daimy</td>\n",
" <td>tromp</td>\n",
" <td>9.41</td>\n",
" <td>8.94</td>\n",
" <td>0.47</td>\n",
" <td>novodra</td>\n",
" <td>40u</td>\n",
" <td>45u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>338</th>\n",
" <td>jeremy</td>\n",
" <td>montagu</td>\n",
" <td>7.68</td>\n",
" <td>7.36</td>\n",
" <td>0.32</td>\n",
" <td>novodra</td>\n",
" <td>52u</td>\n",
" <td>52u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>339</th>\n",
" <td>nebechi</td>\n",
" <td>ekechukwu</td>\n",
" <td>7.78</td>\n",
" <td>7.39</td>\n",
" <td>0.39</td>\n",
" <td>novodra</td>\n",
" <td>37u</td>\n",
" <td>39u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>340</th>\n",
" <td>satsita</td>\n",
" <td>batukayev</td>\n",
" <td>7.63</td>\n",
" <td>7.25</td>\n",
" <td>0.38</td>\n",
" <td>novodra</td>\n",
" <td>42u</td>\n",
" <td>42u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>341</th>\n",
" <td>timothy</td>\n",
" <td>cotton</td>\n",
" <td>7.92</td>\n",
" <td>7.52</td>\n",
" <td>0.40</td>\n",
" <td>novodra</td>\n",
" <td>26u</td>\n",
" <td>25u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>342</th>\n",
" <td>bjørnar</td>\n",
" <td>nilsen</td>\n",
" <td>7.99</td>\n",
" <td>7.70</td>\n",
" <td>0.29</td>\n",
" <td>novodra</td>\n",
" <td>36u</td>\n",
" <td>33u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>343</th>\n",
" <td>borna</td>\n",
" <td>šlezinger</td>\n",
" <td>7.55</td>\n",
" <td>7.18</td>\n",
" <td>0.37</td>\n",
" <td>novodra</td>\n",
" <td>42u</td>\n",
" <td>41u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>344</th>\n",
" <td>mary</td>\n",
" <td>adams</td>\n",
" <td>7.65</td>\n",
" <td>7.26</td>\n",
" <td>0.39</td>\n",
" <td>novodra</td>\n",
" <td>32u</td>\n",
" <td>33u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>345</th>\n",
" <td>christopher</td>\n",
" <td>woodward</td>\n",
" <td>7.51</td>\n",
" <td>7.06</td>\n",
" <td>0.45</td>\n",
" <td>novodra</td>\n",
" <td>55u</td>\n",
" <td>51u</td>\n",
" <td>nausea</td>\n",
" </tr>\n",
" <tr>\n",
" <th>346</th>\n",
" <td>maret</td>\n",
" <td>sultygov</td>\n",
" <td>7.67</td>\n",
" <td>7.30</td>\n",
" <td>0.37</td>\n",
" <td>novodra</td>\n",
" <td>26u</td>\n",
" <td>23u</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>347</th>\n",
" <td>lixue</td>\n",
" <td>hsueh</td>\n",
" <td>9.21</td>\n",
" <td>8.80</td>\n",
" <td>0.41</td>\n",
" <td>novodra</td>\n",
" <td>22u</td>\n",
" <td>23u</td>\n",
" <td>injection site discomfort</td>\n",
" </tr>\n",
" <tr>\n",
" <th>348</th>\n",
" <td>jakob</td>\n",
" <td>jakobsen</td>\n",
" <td>7.96</td>\n",
" <td>7.51</td>\n",
" <td>0.45</td>\n",
" <td>novodra</td>\n",
" <td>28u</td>\n",
" <td>26u</td>\n",
" <td>hypoglycemia</td>\n",
" </tr>\n",
" <tr>\n",
" <th>349</th>\n",
" <td>berta</td>\n",
" <td>napolitani</td>\n",
" <td>7.68</td>\n",
" <td>7.21</td>\n",
" <td>0.47</td>\n",
" <td>novodra</td>\n",
" <td>42u</td>\n",
" <td>44u</td>\n",
" <td>injection site discomfort</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>350 rows × 9 columns</p>\n",
"</div>"
],
"text/plain": [
" given_name surname hba1c_start hba1c_end hba1c_change \\\n",
"0 veronika jindrová 7.63 7.20 0.43 \n",
"1 skye gormanston 7.97 7.62 0.35 \n",
"2 sophia haugen 7.65 7.27 0.38 \n",
"3 eddie archer 7.89 7.55 0.34 \n",
"4 asia woźniak 7.76 7.37 0.39 \n",
"5 joseph day 7.70 7.19 0.51 \n",
"6 roxanne andreyeva 9.54 9.14 0.40 \n",
"7 simone baumgaertner 7.74 7.30 0.44 \n",
"8 enco žibrik 7.78 7.34 0.44 \n",
"9 camilla zaitseva 7.53 7.13 0.40 \n",
"10 tekla walczak 7.61 7.29 0.32 \n",
"11 brancaleone russo 8.61 8.18 0.43 \n",
"12 isac berg 9.68 9.29 0.39 \n",
"13 clinton miller 7.79 7.40 0.39 \n",
"14 eugene mironov 7.81 7.48 0.33 \n",
"15 szilveszter totth 7.70 7.38 0.32 \n",
"16 alexander mathiesen 7.96 7.55 0.41 \n",
"17 chỉ lâm 7.68 7.24 0.44 \n",
"18 władysław wieczorek 7.92 7.47 0.45 \n",
"19 kristján ingason 7.92 7.57 0.35 \n",
"20 marija grubišić 7.53 7.15 0.38 \n",
"21 sauli koivuniemi 7.67 7.37 0.30 \n",
"22 mariana souza 7.86 7.51 0.35 \n",
"23 kristoffer martinsen 9.18 8.64 0.54 \n",
"24 mỹ quynh 7.61 7.16 0.45 \n",
"25 oles zhdanov 7.52 7.11 0.41 \n",
"26 triana. terrazas 7.71 7.34 0.37 \n",
"27 gabryŝ tomaszewski 7.87 7.47 0.40 \n",
"28 leixandre alanis 7.74 7.32 0.42 \n",
"29 onyekachukwu obinna 7.58 7.12 0.46 \n",
".. ... ... ... ... ... \n",
"320 jane citizen 7.98 7.60 0.38 \n",
"321 angela lavrentyev 7.61 7.14 0.47 \n",
"322 edelma villalpando 7.99 7.56 0.43 \n",
"323 annika vaara 7.73 7.34 0.39 \n",
"324 chiho higa 7.71 7.30 0.41 \n",
"325 beatrycze woźniak 7.54 7.17 0.37 \n",
"326 miłosław wiśniewski 7.51 7.08 0.43 \n",
"327 firenze fodor 7.89 7.55 0.34 \n",
"328 zoe wellish 7.71 7.30 0.41 \n",
"329 una traustadóttir 8.00 7.50 0.50 \n",
"330 luboš pecha 7.79 7.45 0.34 \n",
"331 meaza brhane 7.70 7.36 0.34 \n",
"332 adlan shishani 7.84 7.37 0.47 \n",
"333 sofia hermansen 8.90 8.57 0.33 \n",
"334 guðni heimisson 7.64 7.24 0.40 \n",
"335 eufemio rosario 7.54 7.26 0.28 \n",
"336 dalmacia madrid 7.67 7.21 0.46 \n",
"337 daimy tromp 9.41 8.94 0.47 \n",
"338 jeremy montagu 7.68 7.36 0.32 \n",
"339 nebechi ekechukwu 7.78 7.39 0.39 \n",
"340 satsita batukayev 7.63 7.25 0.38 \n",
"341 timothy cotton 7.92 7.52 0.40 \n",
"342 bjørnar nilsen 7.99 7.70 0.29 \n",
"343 borna šlezinger 7.55 7.18 0.37 \n",
"344 mary adams 7.65 7.26 0.39 \n",
"345 christopher woodward 7.51 7.06 0.45 \n",
"346 maret sultygov 7.67 7.30 0.37 \n",
"347 lixue hsueh 9.21 8.80 0.41 \n",
"348 jakob jakobsen 7.96 7.51 0.45 \n",
"349 berta napolitani 7.68 7.21 0.47 \n",
"\n",
" treatment dose_start dose_end adverse_reaction \n",
"0 auralin 41u 48u NaN \n",
"1 auralin 33u 36u NaN \n",
"2 auralin 37u 42u NaN \n",
"3 auralin 31u 38u NaN \n",
"4 auralin 30u 36u NaN \n",
"5 auralin 29u 36u hypoglycemia \n",
"6 auralin 29u 38u NaN \n",
"7 auralin 27u 37u NaN \n",
"8 auralin 55u 68u NaN \n",
"9 auralin 28u 37u NaN \n",
"10 auralin 29u 39u NaN \n",
"11 auralin 53u 60u NaN \n",
"12 auralin 31u 41u NaN \n",
"13 auralin 42u 51u throat irritation \n",
"14 auralin 42u 49u NaN \n",
"15 auralin 35u 39u NaN \n",
"16 auralin 47u 58u NaN \n",
"17 auralin 45u 48u NaN \n",
"18 auralin 24u 37u NaN \n",
"19 auralin 44u 55u NaN \n",
"20 auralin 37u 43u NaN \n",
"21 auralin 43u 47u NaN \n",
"22 auralin 36u 42u NaN \n",
"23 auralin 29u 37u NaN \n",
"24 auralin 57u 64u NaN \n",
"25 auralin 54u 67u NaN \n",
"26 auralin 34u 42u NaN \n",
"27 auralin 29u 37u NaN \n",
"28 auralin 61u 67u NaN \n",
"29 auralin 37u 46u NaN \n",
".. ... ... ... ... \n",
"320 novodra 37u 38u NaN \n",
"321 novodra 28u 24u NaN \n",
"322 novodra 24u 26u NaN \n",
"323 novodra 20u 21u NaN \n",
"324 novodra 46u 46u NaN \n",
"325 novodra 26u 27u NaN \n",
"326 novodra 34u 33u injection site discomfort \n",
"327 novodra 30u 35u NaN \n",
"328 novodra 33u 33u NaN \n",
"329 novodra 35u 34u NaN \n",
"330 novodra 30u 27u NaN \n",
"331 novodra 37u 41u NaN \n",
"332 novodra 43u 40u NaN \n",
"333 novodra 34u 34u injection site discomfort \n",
"334 novodra 40u 36u NaN \n",
"335 novodra 37u 40u NaN \n",
"336 novodra 26u 23u NaN \n",
"337 novodra 40u 45u NaN \n",
"338 novodra 52u 52u NaN \n",
"339 novodra 37u 39u NaN \n",
"340 novodra 42u 42u NaN \n",
"341 novodra 26u 25u NaN \n",
"342 novodra 36u 33u NaN \n",
"343 novodra 42u 41u NaN \n",
"344 novodra 32u 33u NaN \n",
"345 novodra 55u 51u nausea \n",
"346 novodra 26u 23u NaN \n",
"347 novodra 22u 23u injection site discomfort \n",
"348 novodra 28u 26u hypoglycemia \n",
"349 novodra 42u 44u injection site discomfort \n",
"\n",
"[350 rows x 9 columns]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"treatments_clean"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Given name and surname columns in `patients` table duplicated in `treatments` and `adverse_reactions` tables and Lowercase given names and surnames"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Adverse reactions table is no longer needed so ignore that part. Isolate the patient ID and names in the `patients` table, then convert these names to lower case to join with `treatments`. Then drop the given name and surname columns in the treatments table (so these being lowercase isn't an issue anymore)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.6/site-packages/pandas/core/generic.py:4405: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
" self[name] = value\n"
]
}
],
"source": [
"id_names = patients_clean[['patient_id', 'given_name', 'surname']]\n",
"id_names.given_name = id_names.given_name.str.lower()\n",
"id_names.surname = id_names.surname.str.lower()\n",
"treatments_clean = pd.merge(treatments_clean, id_names, on=['given_name', 'surname'])\n",
"treatments_clean = treatments_clean.drop(['given_name', 'surname'], axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>hba1c_start</th>\n",
" <th>hba1c_end</th>\n",
" <th>hba1c_change</th>\n",
" <th>treatment</th>\n",
" <th>dose_start</th>\n",
" <th>dose_end</th>\n",
" <th>adverse_reaction</th>\n",
" <th>patient_id</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>7.63</td>\n",
" <td>7.20</td>\n",
" <td>0.43</td>\n",
" <td>auralin</td>\n",
" <td>41u</td>\n",
" <td>48u</td>\n",
" <td>NaN</td>\n",
" <td>225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>7.97</td>\n",
" <td>7.62</td>\n",
" <td>0.35</td>\n",
" <td>auralin</td>\n",
" <td>33u</td>\n",
" <td>36u</td>\n",
" <td>NaN</td>\n",
" <td>242</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>7.65</td>\n",
" <td>7.27</td>\n",
" <td>0.38</td>\n",
" <td>auralin</td>\n",
" <td>37u</td>\n",
" <td>42u</td>\n",
" <td>NaN</td>\n",
" <td>345</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>7.89</td>\n",
" <td>7.55</td>\n",
" <td>0.34</td>\n",
" <td>auralin</td>\n",
" <td>31u</td>\n",
" <td>38u</td>\n",
" <td>NaN</td>\n",
" <td>276</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>7.76</td>\n",
" <td>7.37</td>\n",
" <td>0.39</td>\n",
" <td>auralin</td>\n",
" <td>30u</td>\n",
" <td>36u</td>\n",
" <td>NaN</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>7.70</td>\n",
" <td>7.19</td>\n",
" <td>0.51</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>36u</td>\n",
" <td>hypoglycemia</td>\n",
" <td>70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>7.70</td>\n",
" <td>7.19</td>\n",
" <td>0.51</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>36u</td>\n",
" <td>hypoglycemia</td>\n",
" <td>70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>9.54</td>\n",
" <td>9.14</td>\n",
" <td>0.40</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>38u</td>\n",
" <td>NaN</td>\n",
" <td>18</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>7.74</td>\n",
" <td>7.30</td>\n",
" <td>0.44</td>\n",
" <td>auralin</td>\n",
" <td>27u</td>\n",
" <td>37u</td>\n",
" <td>NaN</td>\n",
" <td>424</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>7.78</td>\n",
" <td>7.34</td>\n",
" <td>0.44</td>\n",
" <td>auralin</td>\n",
" <td>55u</td>\n",
" <td>68u</td>\n",
" <td>NaN</td>\n",
" <td>292</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>7.53</td>\n",
" <td>7.13</td>\n",
" <td>0.40</td>\n",
" <td>auralin</td>\n",
" <td>28u</td>\n",
" <td>37u</td>\n",
" <td>NaN</td>\n",
" <td>211</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>7.61</td>\n",
" <td>7.29</td>\n",
" <td>0.32</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>39u</td>\n",
" <td>NaN</td>\n",
" <td>133</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>8.61</td>\n",
" <td>8.18</td>\n",
" <td>0.43</td>\n",
" <td>auralin</td>\n",
" <td>53u</td>\n",
" <td>60u</td>\n",
" <td>NaN</td>\n",
" <td>316</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>9.68</td>\n",
" <td>9.29</td>\n",
" <td>0.39</td>\n",
" <td>auralin</td>\n",
" <td>31u</td>\n",
" <td>41u</td>\n",
" <td>NaN</td>\n",
" <td>101</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>7.79</td>\n",
" <td>7.40</td>\n",
" <td>0.39</td>\n",
" <td>auralin</td>\n",
" <td>42u</td>\n",
" <td>51u</td>\n",
" <td>throat irritation</td>\n",
" <td>451</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>7.81</td>\n",
" <td>7.48</td>\n",
" <td>0.33</td>\n",
" <td>auralin</td>\n",
" <td>42u</td>\n",
" <td>49u</td>\n",
" <td>NaN</td>\n",
" <td>335</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>7.70</td>\n",
" <td>7.38</td>\n",
" <td>0.32</td>\n",
" <td>auralin</td>\n",
" <td>35u</td>\n",
" <td>39u</td>\n",
" <td>NaN</td>\n",
" <td>389</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>7.96</td>\n",
" <td>7.55</td>\n",
" <td>0.41</td>\n",
" <td>auralin</td>\n",
" <td>47u</td>\n",
" <td>58u</td>\n",
" <td>NaN</td>\n",
" <td>71</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>7.68</td>\n",
" <td>7.24</td>\n",
" <td>0.44</td>\n",
" <td>auralin</td>\n",
" <td>45u</td>\n",
" <td>48u</td>\n",
" <td>NaN</td>\n",
" <td>297</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>7.92</td>\n",
" <td>7.47</td>\n",
" <td>0.45</td>\n",
" <td>auralin</td>\n",
" <td>24u</td>\n",
" <td>37u</td>\n",
" <td>NaN</td>\n",
" <td>188</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>7.92</td>\n",
" <td>7.57</td>\n",
" <td>0.35</td>\n",
" <td>auralin</td>\n",
" <td>44u</td>\n",
" <td>55u</td>\n",
" <td>NaN</td>\n",
" <td>282</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>7.53</td>\n",
" <td>7.15</td>\n",
" <td>0.38</td>\n",
" <td>auralin</td>\n",
" <td>37u</td>\n",
" <td>43u</td>\n",
" <td>NaN</td>\n",
" <td>174</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>7.67</td>\n",
" <td>7.37</td>\n",
" <td>0.30</td>\n",
" <td>auralin</td>\n",
" <td>43u</td>\n",
" <td>47u</td>\n",
" <td>NaN</td>\n",
" <td>146</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>7.86</td>\n",
" <td>7.51</td>\n",
" <td>0.35</td>\n",
" <td>auralin</td>\n",
" <td>36u</td>\n",
" <td>42u</td>\n",
" <td>NaN</td>\n",
" <td>35</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>9.18</td>\n",
" <td>8.64</td>\n",
" <td>0.54</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>37u</td>\n",
" <td>NaN</td>\n",
" <td>350</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>7.61</td>\n",
" <td>7.16</td>\n",
" <td>0.45</td>\n",
" <td>auralin</td>\n",
" <td>57u</td>\n",
" <td>64u</td>\n",
" <td>NaN</td>\n",
" <td>220</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>7.52</td>\n",
" <td>7.11</td>\n",
" <td>0.41</td>\n",
" <td>auralin</td>\n",
" <td>54u</td>\n",
" <td>67u</td>\n",
" <td>NaN</td>\n",
" <td>102</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>7.71</td>\n",
" <td>7.34</td>\n",
" <td>0.37</td>\n",
" <td>auralin</td>\n",
" <td>34u</td>\n",
" <td>42u</td>\n",
" <td>NaN</td>\n",
" <td>181</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>7.87</td>\n",
" <td>7.47</td>\n",
" <td>0.40</td>\n",
" <td>auralin</td>\n",
" <td>29u</td>\n",
" <td>37u</td>\n",
" <td>NaN</td>\n",
" <td>466</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>7.74</td>\n",
" <td>7.32</td>\n",
" <td>0.42</td>\n",
" <td>auralin</td>\n",
" <td>61u</td>\n",
" <td>67u</td>\n",
" <td>NaN</td>\n",
" <td>205</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>319</th>\n",
" <td>7.98</td>\n",
" <td>7.60</td>\n",
" <td>0.38</td>\n",
" <td>novodra</td>\n",
" <td>37u</td>\n",
" <td>38u</td>\n",
" <td>NaN</td>\n",
" <td>187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>320</th>\n",
" <td>7.61</td>\n",
" <td>7.14</td>\n",
" <td>0.47</td>\n",
" <td>novodra</td>\n",
" <td>28u</td>\n",
" <td>24u</td>\n",
" <td>NaN</td>\n",
" <td>234</td>\n",
" </tr>\n",
" <tr>\n",
" <th>321</th>\n",
" <td>7.99</td>\n",
" <td>7.56</td>\n",
" <td>0.43</td>\n",
" <td>novodra</td>\n",
" <td>24u</td>\n",
" <td>26u</td>\n",
" <td>NaN</td>\n",
" <td>479</td>\n",
" </tr>\n",
" <tr>\n",
" <th>322</th>\n",
" <td>7.73</td>\n",
" <td>7.34</td>\n",
" <td>0.39</td>\n",
" <td>novodra</td>\n",
" <td>20u</td>\n",
" <td>21u</td>\n",
" <td>NaN</td>\n",
" <td>49</td>\n",
" </tr>\n",
" <tr>\n",
" <th>323</th>\n",
" <td>7.71</td>\n",
" <td>7.30</td>\n",
" <td>0.41</td>\n",
" <td>novodra</td>\n",
" <td>46u</td>\n",
" <td>46u</td>\n",
" <td>NaN</td>\n",
" <td>356</td>\n",
" </tr>\n",
" <tr>\n",
" <th>324</th>\n",
" <td>7.54</td>\n",
" <td>7.17</td>\n",
" <td>0.37</td>\n",
" <td>novodra</td>\n",
" <td>26u</td>\n",
" <td>27u</td>\n",
" <td>NaN</td>\n",
" <td>208</td>\n",
" </tr>\n",
" <tr>\n",
" <th>325</th>\n",
" <td>7.51</td>\n",
" <td>7.08</td>\n",
" <td>0.43</td>\n",
" <td>novodra</td>\n",
" <td>34u</td>\n",
" <td>33u</td>\n",
" <td>injection site discomfort</td>\n",
" <td>373</td>\n",
" </tr>\n",
" <tr>\n",
" <th>326</th>\n",
" <td>7.89</td>\n",
" <td>7.55</td>\n",
" <td>0.34</td>\n",
" <td>novodra</td>\n",
" <td>30u</td>\n",
" <td>35u</td>\n",
" <td>NaN</td>\n",
" <td>63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>327</th>\n",
" <td>7.71</td>\n",
" <td>7.30</td>\n",
" <td>0.41</td>\n",
" <td>novodra</td>\n",
" <td>33u</td>\n",
" <td>33u</td>\n",
" <td>NaN</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>328</th>\n",
" <td>8.00</td>\n",
" <td>7.50</td>\n",
" <td>0.50</td>\n",
" <td>novodra</td>\n",
" <td>35u</td>\n",
" <td>34u</td>\n",
" <td>NaN</td>\n",
" <td>291</td>\n",
" </tr>\n",
" <tr>\n",
" <th>329</th>\n",
" <td>7.79</td>\n",
" <td>7.45</td>\n",
" <td>0.34</td>\n",
" <td>novodra</td>\n",
" <td>30u</td>\n",
" <td>27u</td>\n",
" <td>NaN</td>\n",
" <td>363</td>\n",
" </tr>\n",
" <tr>\n",
" <th>330</th>\n",
" <td>7.70</td>\n",
" <td>7.36</td>\n",
" <td>0.34</td>\n",
" <td>novodra</td>\n",
" <td>37u</td>\n",
" <td>41u</td>\n",
" <td>NaN</td>\n",
" <td>465</td>\n",
" </tr>\n",
" <tr>\n",
" <th>331</th>\n",
" <td>7.84</td>\n",
" <td>7.37</td>\n",
" <td>0.47</td>\n",
" <td>novodra</td>\n",
" <td>43u</td>\n",
" <td>40u</td>\n",
" <td>NaN</td>\n",
" <td>421</td>\n",
" </tr>\n",
" <tr>\n",
" <th>332</th>\n",
" <td>8.90</td>\n",
" <td>8.57</td>\n",
" <td>0.33</td>\n",
" <td>novodra</td>\n",
" <td>34u</td>\n",
" <td>34u</td>\n",
" <td>injection site discomfort</td>\n",
" <td>376</td>\n",
" </tr>\n",
" <tr>\n",
" <th>333</th>\n",
" <td>7.64</td>\n",
" <td>7.24</td>\n",
" <td>0.40</td>\n",
" <td>novodra</td>\n",
" <td>40u</td>\n",
" <td>36u</td>\n",
" <td>NaN</td>\n",
" <td>463</td>\n",
" </tr>\n",
" <tr>\n",
" <th>334</th>\n",
" <td>7.54</td>\n",
" <td>7.26</td>\n",
" <td>0.28</td>\n",
" <td>novodra</td>\n",
" <td>37u</td>\n",
" <td>40u</td>\n",
" <td>NaN</td>\n",
" <td>81</td>\n",
" </tr>\n",
" <tr>\n",
" <th>335</th>\n",
" <td>7.67</td>\n",
" <td>7.21</td>\n",
" <td>0.46</td>\n",
" <td>novodra</td>\n",
" <td>26u</td>\n",
" <td>23u</td>\n",
" <td>NaN</td>\n",
" <td>322</td>\n",
" </tr>\n",
" <tr>\n",
" <th>336</th>\n",
" <td>9.41</td>\n",
" <td>8.94</td>\n",
" <td>0.47</td>\n",
" <td>novodra</td>\n",
" <td>40u</td>\n",
" <td>45u</td>\n",
" <td>NaN</td>\n",
" <td>392</td>\n",
" </tr>\n",
" <tr>\n",
" <th>337</th>\n",
" <td>7.68</td>\n",
" <td>7.36</td>\n",
" <td>0.32</td>\n",
" <td>novodra</td>\n",
" <td>52u</td>\n",
" <td>52u</td>\n",
" <td>NaN</td>\n",
" <td>262</td>\n",
" </tr>\n",
" <tr>\n",
" <th>338</th>\n",
" <td>7.78</td>\n",
" <td>7.39</td>\n",
" <td>0.39</td>\n",
" <td>novodra</td>\n",
" <td>37u</td>\n",
" <td>39u</td>\n",
" <td>NaN</td>\n",
" <td>68</td>\n",
" </tr>\n",
" <tr>\n",
" <th>339</th>\n",
" <td>7.63</td>\n",
" <td>7.25</td>\n",
" <td>0.38</td>\n",
" <td>novodra</td>\n",
" <td>42u</td>\n",
" <td>42u</td>\n",
" <td>NaN</td>\n",
" <td>152</td>\n",
" </tr>\n",
" <tr>\n",
" <th>340</th>\n",
" <td>7.92</td>\n",
" <td>7.52</td>\n",
" <td>0.40</td>\n",
" <td>novodra</td>\n",
" <td>26u</td>\n",
" <td>25u</td>\n",
" <td>NaN</td>\n",
" <td>431</td>\n",
" </tr>\n",
" <tr>\n",
" <th>341</th>\n",
" <td>7.99</td>\n",
" <td>7.70</td>\n",
" <td>0.29</td>\n",
" <td>novodra</td>\n",
" <td>36u</td>\n",
" <td>33u</td>\n",
" <td>NaN</td>\n",
" <td>450</td>\n",
" </tr>\n",
" <tr>\n",
" <th>342</th>\n",
" <td>7.55</td>\n",
" <td>7.18</td>\n",
" <td>0.37</td>\n",
" <td>novodra</td>\n",
" <td>42u</td>\n",
" <td>41u</td>\n",
" <td>NaN</td>\n",
" <td>194</td>\n",
" </tr>\n",
" <tr>\n",
" <th>343</th>\n",
" <td>7.65</td>\n",
" <td>7.26</td>\n",
" <td>0.39</td>\n",
" <td>novodra</td>\n",
" <td>32u</td>\n",
" <td>33u</td>\n",
" <td>NaN</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>344</th>\n",
" <td>7.51</td>\n",
" <td>7.06</td>\n",
" <td>0.45</td>\n",
" <td>novodra</td>\n",
" <td>55u</td>\n",
" <td>51u</td>\n",
" <td>nausea</td>\n",
" <td>153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>345</th>\n",
" <td>7.67</td>\n",
" <td>7.30</td>\n",
" <td>0.37</td>\n",
" <td>novodra</td>\n",
" <td>26u</td>\n",
" <td>23u</td>\n",
" <td>NaN</td>\n",
" <td>420</td>\n",
" </tr>\n",
" <tr>\n",
" <th>346</th>\n",
" <td>9.21</td>\n",
" <td>8.80</td>\n",
" <td>0.41</td>\n",
" <td>novodra</td>\n",
" <td>22u</td>\n",
" <td>23u</td>\n",
" <td>injection site discomfort</td>\n",
" <td>336</td>\n",
" </tr>\n",
" <tr>\n",
" <th>347</th>\n",
" <td>7.96</td>\n",
" <td>7.51</td>\n",
" <td>0.45</td>\n",
" <td>novodra</td>\n",
" <td>28u</td>\n",
" <td>26u</td>\n",
" <td>hypoglycemia</td>\n",
" <td>25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>348</th>\n",
" <td>7.68</td>\n",
" <td>7.21</td>\n",
" <td>0.47</td>\n",
" <td>novodra</td>\n",
" <td>42u</td>\n",
" <td>44u</td>\n",
" <td>injection site discomfort</td>\n",
" <td>477</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>349 rows × 8 columns</p>\n",
"</div>"
],
"text/plain": [
" hba1c_start hba1c_end hba1c_change treatment dose_start dose_end \\\n",
"0 7.63 7.20 0.43 auralin 41u 48u \n",
"1 7.97 7.62 0.35 auralin 33u 36u \n",
"2 7.65 7.27 0.38 auralin 37u 42u \n",
"3 7.89 7.55 0.34 auralin 31u 38u \n",
"4 7.76 7.37 0.39 auralin 30u 36u \n",
"5 7.70 7.19 0.51 auralin 29u 36u \n",
"6 7.70 7.19 0.51 auralin 29u 36u \n",
"7 9.54 9.14 0.40 auralin 29u 38u \n",
"8 7.74 7.30 0.44 auralin 27u 37u \n",
"9 7.78 7.34 0.44 auralin 55u 68u \n",
"10 7.53 7.13 0.40 auralin 28u 37u \n",
"11 7.61 7.29 0.32 auralin 29u 39u \n",
"12 8.61 8.18 0.43 auralin 53u 60u \n",
"13 9.68 9.29 0.39 auralin 31u 41u \n",
"14 7.79 7.40 0.39 auralin 42u 51u \n",
"15 7.81 7.48 0.33 auralin 42u 49u \n",
"16 7.70 7.38 0.32 auralin 35u 39u \n",
"17 7.96 7.55 0.41 auralin 47u 58u \n",
"18 7.68 7.24 0.44 auralin 45u 48u \n",
"19 7.92 7.47 0.45 auralin 24u 37u \n",
"20 7.92 7.57 0.35 auralin 44u 55u \n",
"21 7.53 7.15 0.38 auralin 37u 43u \n",
"22 7.67 7.37 0.30 auralin 43u 47u \n",
"23 7.86 7.51 0.35 auralin 36u 42u \n",
"24 9.18 8.64 0.54 auralin 29u 37u \n",
"25 7.61 7.16 0.45 auralin 57u 64u \n",
"26 7.52 7.11 0.41 auralin 54u 67u \n",
"27 7.71 7.34 0.37 auralin 34u 42u \n",
"28 7.87 7.47 0.40 auralin 29u 37u \n",
"29 7.74 7.32 0.42 auralin 61u 67u \n",
".. ... ... ... ... ... ... \n",
"319 7.98 7.60 0.38 novodra 37u 38u \n",
"320 7.61 7.14 0.47 novodra 28u 24u \n",
"321 7.99 7.56 0.43 novodra 24u 26u \n",
"322 7.73 7.34 0.39 novodra 20u 21u \n",
"323 7.71 7.30 0.41 novodra 46u 46u \n",
"324 7.54 7.17 0.37 novodra 26u 27u \n",
"325 7.51 7.08 0.43 novodra 34u 33u \n",
"326 7.89 7.55 0.34 novodra 30u 35u \n",
"327 7.71 7.30 0.41 novodra 33u 33u \n",
"328 8.00 7.50 0.50 novodra 35u 34u \n",
"329 7.79 7.45 0.34 novodra 30u 27u \n",
"330 7.70 7.36 0.34 novodra 37u 41u \n",
"331 7.84 7.37 0.47 novodra 43u 40u \n",
"332 8.90 8.57 0.33 novodra 34u 34u \n",
"333 7.64 7.24 0.40 novodra 40u 36u \n",
"334 7.54 7.26 0.28 novodra 37u 40u \n",
"335 7.67 7.21 0.46 novodra 26u 23u \n",
"336 9.41 8.94 0.47 novodra 40u 45u \n",
"337 7.68 7.36 0.32 novodra 52u 52u \n",
"338 7.78 7.39 0.39 novodra 37u 39u \n",
"339 7.63 7.25 0.38 novodra 42u 42u \n",
"340 7.92 7.52 0.40 novodra 26u 25u \n",
"341 7.99 7.70 0.29 novodra 36u 33u \n",
"342 7.55 7.18 0.37 novodra 42u 41u \n",
"343 7.65 7.26 0.39 novodra 32u 33u \n",
"344 7.51 7.06 0.45 novodra 55u 51u \n",
"345 7.67 7.30 0.37 novodra 26u 23u \n",
"346 9.21 8.80 0.41 novodra 22u 23u \n",
"347 7.96 7.51 0.45 novodra 28u 26u \n",
"348 7.68 7.21 0.47 novodra 42u 44u \n",
"\n",
" adverse_reaction patient_id \n",
"0 NaN 225 \n",
"1 NaN 242 \n",
"2 NaN 345 \n",
"3 NaN 276 \n",
"4 NaN 15 \n",
"5 hypoglycemia 70 \n",
"6 hypoglycemia 70 \n",
"7 NaN 18 \n",
"8 NaN 424 \n",
"9 NaN 292 \n",
"10 NaN 211 \n",
"11 NaN 133 \n",
"12 NaN 316 \n",
"13 NaN 101 \n",
"14 throat irritation 451 \n",
"15 NaN 335 \n",
"16 NaN 389 \n",
"17 NaN 71 \n",
"18 NaN 297 \n",
"19 NaN 188 \n",
"20 NaN 282 \n",
"21 NaN 174 \n",
"22 NaN 146 \n",
"23 NaN 35 \n",
"24 NaN 350 \n",
"25 NaN 220 \n",
"26 NaN 102 \n",
"27 NaN 181 \n",
"28 NaN 466 \n",
"29 NaN 205 \n",
".. ... ... \n",
"319 NaN 187 \n",
"320 NaN 234 \n",
"321 NaN 479 \n",
"322 NaN 49 \n",
"323 NaN 356 \n",
"324 NaN 208 \n",
"325 injection site discomfort 373 \n",
"326 NaN 63 \n",
"327 NaN 1 \n",
"328 NaN 291 \n",
"329 NaN 363 \n",
"330 NaN 465 \n",
"331 NaN 421 \n",
"332 injection site discomfort 376 \n",
"333 NaN 463 \n",
"334 NaN 81 \n",
"335 NaN 322 \n",
"336 NaN 392 \n",
"337 NaN 262 \n",
"338 NaN 68 \n",
"339 NaN 152 \n",
"340 NaN 431 \n",
"341 NaN 450 \n",
"342 NaN 194 \n",
"343 NaN 7 \n",
"344 nausea 153 \n",
"345 NaN 420 \n",
"346 injection site discomfort 336 \n",
"347 hypoglycemia 25 \n",
"348 injection site discomfort 477 \n",
"\n",
"[349 rows x 8 columns]"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Confirm the merge was executed correctly\n",
"treatments_clean"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"22 patient_id\n",
"dtype: object"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Patient ID should be the only duplicate column\n",
"all_columns = pd.Series(list(patients_clean) + list(treatments_clean))\n",
"all_columns[all_columns.duplicated()]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Quality"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Zip code is a float not a string and Zip code has four digits sometimes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Convert the zip code column's data type from a float to a string using `astype`, remove the '.0' using string slicing, and pad four digit zip codes with a leading 0."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"patients_clean.zip_code = patients_clean.zip_code.astype(str).str[:-2].str.pad(5, fillchar='0')\n",
"# Reconvert NaNs entries that were converted to '0000n' by code above\n",
"patients_clean.zip_code = patients_clean.zip_code.replace('0000n', np.nan)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"0 92390\n",
"1 61812\n",
"2 68467\n",
"3 07095\n",
"4 36303\n",
"Name: zip_code, dtype: object"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients_clean.zip_code.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Tim Neudorf height is 27 in instead of 72 in"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Replace height for rows in the `patients` table that have a height of 27 in (there is only one) with 72 in."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"patients_clean.height = patients_clean.height.replace(27, 72)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>assigned_sex</th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>address</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip_code</th>\n",
" <th>country</th>\n",
" <th>birthdate</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" <th>phone_number</th>\n",
" <th>email</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [patient_id, assigned_sex, given_name, surname, address, city, state, zip_code, country, birthdate, weight, height, bmi, phone_number, email]\n",
"Index: []"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Should be empty\n",
"patients_clean[patients_clean.height == 27]"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>assigned_sex</th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>address</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip_code</th>\n",
" <th>country</th>\n",
" <th>birthdate</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" <th>phone_number</th>\n",
" <th>email</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5</td>\n",
" <td>male</td>\n",
" <td>Tim</td>\n",
" <td>Neudorf</td>\n",
" <td>1428 Turkey Pen Lane</td>\n",
" <td>Dothan</td>\n",
" <td>AL</td>\n",
" <td>36303</td>\n",
" <td>United States</td>\n",
" <td>2/18/1928</td>\n",
" <td>192.3</td>\n",
" <td>72</td>\n",
" <td>26.1</td>\n",
" <td>334-515-7487</td>\n",
" <td>TimNeudorf@cuvox.de</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" patient_id assigned_sex given_name surname address city \\\n",
"4 5 male Tim Neudorf 1428 Turkey Pen Lane Dothan \n",
"\n",
" state zip_code country birthdate weight height bmi \\\n",
"4 AL 36303 United States 2/18/1928 192.3 72 26.1 \n",
"\n",
" phone_number email \n",
"4 334-515-7487 TimNeudorf@cuvox.de "
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Confirm the replacement worked\n",
"patients_clean[patients_clean.surname == 'Neudorf']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Full state names sometimes, abbreviations other times"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Apply a function that converts full state name to state abbreviation for California, New York, Illinois, Florida, and Nebraska."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Mapping from full state name to abbreviation\n",
"state_abbrev = {'California': 'CA',\n",
" 'New York': 'NY',\n",
" 'Illinois': 'IL',\n",
" 'Florida': 'FL',\n",
" 'Nebraska': 'NE'}\n",
"\n",
"# Function to apply\n",
"def abbreviate_state(patient):\n",
" if patient['state'] in state_abbrev.keys():\n",
" abbrev = state_abbrev[patient['state']]\n",
" return abbrev\n",
" else:\n",
" return patient['state']\n",
" \n",
"patients_clean['state'] = patients_clean.apply(abbreviate_state, axis=1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"CA 60\n",
"NY 47\n",
"TX 32\n",
"IL 24\n",
"MA 22\n",
"FL 22\n",
"PA 18\n",
"GA 15\n",
"OH 14\n",
"LA 13\n",
"MI 13\n",
"OK 13\n",
"NJ 12\n",
"VA 11\n",
"WI 10\n",
"MS 10\n",
"AL 9\n",
"MN 9\n",
"TN 9\n",
"IN 9\n",
"WA 8\n",
"NC 8\n",
"KY 8\n",
"MO 7\n",
"KS 6\n",
"NV 6\n",
"ID 6\n",
"NE 6\n",
"CT 5\n",
"SC 5\n",
"IA 5\n",
"AR 4\n",
"RI 4\n",
"ME 4\n",
"AZ 4\n",
"CO 4\n",
"ND 4\n",
"OR 3\n",
"DE 3\n",
"WV 3\n",
"SD 3\n",
"MD 3\n",
"MT 2\n",
"VT 2\n",
"DC 2\n",
"AK 1\n",
"NH 1\n",
"WY 1\n",
"NM 1\n",
"Name: state, dtype: int64"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients_clean.state.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 CA\n",
"1 IL\n",
"2 NE\n",
"3 NJ\n",
"4 AL\n",
"5 FL\n",
"6 NV\n",
"7 CA\n",
"8 MO\n",
"9 NY\n",
"10 MI\n",
"11 TN\n",
"12 VA\n",
"13 AL\n",
"14 OK\n",
"15 VA\n",
"16 CA\n",
"17 GA\n",
"18 MT\n",
"19 CA\n",
"20 MA\n",
"21 NJ\n",
"22 NY\n",
"23 FL\n",
"24 NY\n",
"25 NM\n",
"26 IL\n",
"27 LA\n",
"28 PA\n",
"30 CO\n",
" ..\n",
"472 MS\n",
"473 MA\n",
"474 GA\n",
"475 CA\n",
"476 PA\n",
"477 SC\n",
"478 CA\n",
"479 CA\n",
"480 TX\n",
"481 OK\n",
"482 TX\n",
"483 TN\n",
"484 NC\n",
"485 FL\n",
"486 TX\n",
"487 KS\n",
"488 FL\n",
"489 CA\n",
"490 NY\n",
"491 TX\n",
"492 CA\n",
"493 ID\n",
"494 MI\n",
"495 CA\n",
"496 CA\n",
"497 OK\n",
"498 ME\n",
"499 AZ\n",
"500 MO\n",
"501 WA\n",
"Name: state, Length: 494, dtype: category\n",
"Categories (49, object): [AK, AL, AR, AZ, ..., WA, WI, WV, WY]"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients_clean.state"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Dsvid Gustafsson"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Replace given name for rows in the `patients` table that have a given name of 'Dsvid' with 'David'."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"patients_clean.given_name = patients_clean.given_name.replace('Dsvid', 'David')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>assigned_sex</th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>address</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip_code</th>\n",
" <th>country</th>\n",
" <th>birthdate</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" <th>phone_number</th>\n",
" <th>email</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>9</td>\n",
" <td>male</td>\n",
" <td>David</td>\n",
" <td>Gustafsson</td>\n",
" <td>1790 Nutter Street</td>\n",
" <td>Kansas City</td>\n",
" <td>MO</td>\n",
" <td>64105</td>\n",
" <td>United States</td>\n",
" <td>3/6/1937</td>\n",
" <td>163.9</td>\n",
" <td>66</td>\n",
" <td>26.5</td>\n",
" <td>816-265-9578</td>\n",
" <td>DavidGustafsson@armyspy.com</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" patient_id assigned_sex given_name surname address \\\n",
"8 9 male David Gustafsson 1790 Nutter Street \n",
"\n",
" city state zip_code country birthdate weight height bmi \\\n",
"8 Kansas City MO 64105 United States 3/6/1937 163.9 66 26.5 \n",
"\n",
" phone_number email \n",
"8 816-265-9578 DavidGustafsson@armyspy.com "
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients_clean[patients_clean.surname == 'Gustafsson']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Erroneous datatypes (assigned sex, state, zip_code, and birthdate columns) and Erroneous datatypes (auralin and novodra columns) and The letter 'u' in starting and ending doses for Auralin and Novodra"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Convert assigned sex and state to categorical data types. Zip code data type was already addressed above. Convert birthdate to datetime data type. Strip the letter 'u' in start dose and end dose and convert those columns to data type integer."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# To category\n",
"patients_clean.assigned_sex = patients_clean.assigned_sex.astype('category')\n",
"patients_clean.state = patients_clean.state.astype('category')\n",
"\n",
"# To datetime\n",
"patients_clean.birthdate = pd.to_datetime(patients_clean.birthdate)\n",
"\n",
"# Strip u and to integer\n",
"treatments_clean.dose_start = treatments_clean.dose_start.str.strip('u').astype(int)\n",
"treatments_clean.dose_end = treatments_clean.dose_end.str.strip('u').astype(int)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 503 entries, 0 to 502\n",
"Data columns (total 15 columns):\n",
"patient_id 503 non-null int64\n",
"assigned_sex 503 non-null category\n",
"given_name 503 non-null object\n",
"surname 503 non-null object\n",
"address 491 non-null object\n",
"city 491 non-null object\n",
"state 491 non-null category\n",
"zip_code 491 non-null object\n",
"country 491 non-null object\n",
"birthdate 503 non-null datetime64[ns]\n",
"weight 503 non-null float64\n",
"height 503 non-null int64\n",
"bmi 503 non-null float64\n",
"phone_number 491 non-null object\n",
"email 491 non-null object\n",
"dtypes: category(2), datetime64[ns](1), float64(2), int64(2), object(8)\n",
"memory usage: 53.9+ KB\n"
]
}
],
"source": [
"patients_clean.info()"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 349 entries, 0 to 348\n",
"Data columns (total 8 columns):\n",
"hba1c_start 349 non-null float64\n",
"hba1c_end 349 non-null float64\n",
"hba1c_change 349 non-null float64\n",
"treatment 349 non-null object\n",
"dose_start 349 non-null int64\n",
"dose_end 349 non-null int64\n",
"adverse_reaction 35 non-null object\n",
"patient_id 349 non-null int64\n",
"dtypes: float64(3), int64(3), object(2)\n",
"memory usage: 24.5+ KB\n"
]
}
],
"source": [
"treatments_clean.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Multiple phone number formats"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Strip all \" \", \"-\", \"(\", \")\", and \"+\" and store each number without any formatting. Pad the phone number with a 1 if the length of the number is 10 digits (we want country code)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"patients_clean.phone_number = patients_clean.phone_number.str.replace(r'\\D+', '').str.pad(11, fillchar='1')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"0 19517199170\n",
"1 12175693204\n",
"2 14023636804\n",
"3 17326368246\n",
"4 13345157487\n",
"Name: phone_number, dtype: object"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients_clean.phone_number.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Default John Doe data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Remove the non-recoverable John Doe records from the `patients` table."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"patients_clean = patients_clean[patients_clean.surname != 'Doe']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Jakobsen 3\n",
"Taylor 3\n",
"Dratchev 2\n",
"Berg 2\n",
"Kowalczyk 2\n",
"Aranda 2\n",
"Grímsdóttir 2\n",
"Gersten 2\n",
"Silva 2\n",
"Johnson 2\n",
"Lâm 2\n",
"Cabrera 2\n",
"Ogochukwu 2\n",
"Collins 2\n",
"Cindrić 2\n",
"Batukayev 2\n",
"Hueber 2\n",
"Kadyrov 2\n",
"Nilsen 2\n",
"Parker 2\n",
"Schiavone 2\n",
"Liễu 2\n",
"Tạ 2\n",
"Woźniak 2\n",
"Souza 2\n",
"Tucker 2\n",
"Lund 2\n",
"Bùi 2\n",
"Lương 2\n",
"Correia 2\n",
" ..\n",
"Isaksson 1\n",
"Tobeolisa 1\n",
"Novosel 1\n",
"Totth 1\n",
"Kos 1\n",
"McGregor 1\n",
"Madrid 1\n",
"Mancini 1\n",
"Shuler 1\n",
"Klobučar 1\n",
"Galić 1\n",
"Lončar 1\n",
"Moore 1\n",
"Vaneker 1\n",
"Uspenskaya 1\n",
"Hsu 1\n",
"Komavec 1\n",
"Gyenes 1\n",
"Pecinová 1\n",
"Sleiman 1\n",
"Isa 1\n",
"Mortensen 1\n",
"Lê 1\n",
"Afanasyeva 1\n",
"Hill 1\n",
"Vaara 1\n",
"Reilly 1\n",
"Yonatan 1\n",
"Henzen 1\n",
"Musliyevich 1\n",
"Name: surname, Length: 465, dtype: int64"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Should be no Doe records\n",
"patients_clean.surname.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"2778 North Avenue 2\n",
"648 Old Dear Lane 2\n",
"2476 Fulton Street 2\n",
"3542 Robinson Court 1\n",
"3499 Baker Avenue 1\n",
"1079 Ingram Street 1\n",
"3072 Braxton Street 1\n",
"570 Alpha Avenue 1\n",
"4851 Andy Street 1\n",
"441 Tibbs Avenue 1\n",
"192 Patton Lane 1\n",
"649 Nutter Street 1\n",
"2645 Moore Avenue 1\n",
"1330 Lincoln Street 1\n",
"2931 Romano Street 1\n",
"1731 Chandler Drive 1\n",
"3008 Walkers Ridge Way 1\n",
"494 Clarksburg Park Road 1\n",
"3113 Timber Ridge Road 1\n",
"475 Preston Street 1\n",
"4649 Worley Avenue 1\n",
"2578 Tenmile 1\n",
"1774 George Avenue 1\n",
"4310 Johnson Street 1\n",
"353 Whaley Lane 1\n",
"4111 Thunder Road 1\n",
"4386 Camden Street 1\n",
"4500 Myra Street 1\n",
"783 Callison Lane 1\n",
"3464 Big Indian 1\n",
" ..\n",
"4839 North Avenue 1\n",
"577 Chipmunk Lane 1\n",
"2356 Myra Street 1\n",
"3141 Brentwood Drive 1\n",
"4220 Simpson Square 1\n",
"3006 Maple Court 1\n",
"1463 Martha Ellen Drive 1\n",
"3781 Hamill Avenue 1\n",
"3402 Kildeer Drive 1\n",
"954 Summit Park Avenue 1\n",
"2775 Single Street 1\n",
"1368 Yorkshire Circle 1\n",
"2270 Bel Meadow Drive 1\n",
"1934 August Lane 1\n",
"1942 Harry Place 1\n",
"4093 Smith Street 1\n",
"2813 Frederick Street 1\n",
"2324 Benson Street 1\n",
"3391 Marcus Street 1\n",
"2970 Forest Avenue 1\n",
"550 Cliffside Drive 1\n",
"1373 Wilmar Farm Road 1\n",
"1815 Garrett Street 1\n",
"3868 Freed Drive 1\n",
"212 Tibbs Avenue 1\n",
"3595 Stuart Street 1\n",
"1526 Tully Street 1\n",
"2146 Willow Greene Drive 1\n",
"4103 Musgrave Street 1\n",
"3314 Rocket Drive 1\n",
"Name: address, Length: 482, dtype: int64"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Should be no 123 Main Street records\n",
"patients_clean.address.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Multiple records for Jakobsen, Gersten, Taylor"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Remove the Jake Jakobsen, Pat Gersten, and Sandy Taylor rows from the `patients` table. These are the nicknames, which happen to also not be in the `treatments` table (removing the wrong name would create a consistency issue between the `patients` and `treatments` table). These are all the second occurrence of the duplicate. These are also the only occurences of non-null duplicate addresses."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"# tilde means not: http://pandas.pydata.org/pandas-docs/stable/indexing.html#boolean-indexing\n",
"patients_clean = patients_clean[~((patients_clean.address.duplicated()) & patients_clean.address.notnull())]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>assigned_sex</th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>address</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip_code</th>\n",
" <th>country</th>\n",
" <th>birthdate</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" <th>phone_number</th>\n",
" <th>email</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>25</td>\n",
" <td>male</td>\n",
" <td>Jakob</td>\n",
" <td>Jakobsen</td>\n",
" <td>648 Old Dear Lane</td>\n",
" <td>Port Jervis</td>\n",
" <td>NY</td>\n",
" <td>12771</td>\n",
" <td>United States</td>\n",
" <td>1985-08-01</td>\n",
" <td>155.8</td>\n",
" <td>67</td>\n",
" <td>24.4</td>\n",
" <td>18458587707</td>\n",
" <td>JakobCJakobsen@einrot.com</td>\n",
" </tr>\n",
" <tr>\n",
" <th>432</th>\n",
" <td>433</td>\n",
" <td>female</td>\n",
" <td>Karen</td>\n",
" <td>Jakobsen</td>\n",
" <td>1690 Fannie Street</td>\n",
" <td>Houston</td>\n",
" <td>TX</td>\n",
" <td>77020</td>\n",
" <td>United States</td>\n",
" <td>1962-11-25</td>\n",
" <td>185.2</td>\n",
" <td>67</td>\n",
" <td>29.0</td>\n",
" <td>19792030438</td>\n",
" <td>KarenJakobsen@jourrapide.com</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" patient_id assigned_sex given_name surname address \\\n",
"24 25 male Jakob Jakobsen 648 Old Dear Lane \n",
"432 433 female Karen Jakobsen 1690 Fannie Street \n",
"\n",
" city state zip_code country birthdate weight height \\\n",
"24 Port Jervis NY 12771 United States 1985-08-01 155.8 67 \n",
"432 Houston TX 77020 United States 1962-11-25 185.2 67 \n",
"\n",
" bmi phone_number email \n",
"24 24.4 18458587707 JakobCJakobsen@einrot.com \n",
"432 29.0 19792030438 KarenJakobsen@jourrapide.com "
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients_clean[patients_clean.surname == 'Jakobsen']"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>assigned_sex</th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>address</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip_code</th>\n",
" <th>country</th>\n",
" <th>birthdate</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" <th>phone_number</th>\n",
" <th>email</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>98</td>\n",
" <td>male</td>\n",
" <td>Patrick</td>\n",
" <td>Gersten</td>\n",
" <td>2778 North Avenue</td>\n",
" <td>Burr</td>\n",
" <td>NE</td>\n",
" <td>68324</td>\n",
" <td>United States</td>\n",
" <td>1954-05-03</td>\n",
" <td>138.2</td>\n",
" <td>71</td>\n",
" <td>19.3</td>\n",
" <td>14028484923</td>\n",
" <td>PatrickGersten@rhyta.com</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" patient_id assigned_sex given_name surname address city \\\n",
"97 98 male Patrick Gersten 2778 North Avenue Burr \n",
"\n",
" state zip_code country birthdate weight height bmi \\\n",
"97 NE 68324 United States 1954-05-03 138.2 71 19.3 \n",
"\n",
" phone_number email \n",
"97 14028484923 PatrickGersten@rhyta.com "
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients_clean[patients_clean.surname == 'Gersten']"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>patient_id</th>\n",
" <th>assigned_sex</th>\n",
" <th>given_name</th>\n",
" <th>surname</th>\n",
" <th>address</th>\n",
" <th>city</th>\n",
" <th>state</th>\n",
" <th>zip_code</th>\n",
" <th>country</th>\n",
" <th>birthdate</th>\n",
" <th>weight</th>\n",
" <th>height</th>\n",
" <th>bmi</th>\n",
" <th>phone_number</th>\n",
" <th>email</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>131</th>\n",
" <td>132</td>\n",
" <td>female</td>\n",
" <td>Sandra</td>\n",
" <td>Taylor</td>\n",
" <td>2476 Fulton Street</td>\n",
" <td>Rainelle</td>\n",
" <td>WV</td>\n",
" <td>25962</td>\n",
" <td>United States</td>\n",
" <td>1960-10-23</td>\n",
" <td>206.1</td>\n",
" <td>64</td>\n",
" <td>35.4</td>\n",
" <td>13044382648</td>\n",
" <td>SandraCTaylor@dayrep.com</td>\n",
" </tr>\n",
" <tr>\n",
" <th>426</th>\n",
" <td>427</td>\n",
" <td>male</td>\n",
" <td>Rogelio</td>\n",
" <td>Taylor</td>\n",
" <td>4064 Marigold Lane</td>\n",
" <td>Miami</td>\n",
" <td>FL</td>\n",
" <td>33179</td>\n",
" <td>United States</td>\n",
" <td>1992-09-02</td>\n",
" <td>186.6</td>\n",
" <td>69</td>\n",
" <td>27.6</td>\n",
" <td>13054346299</td>\n",
" <td>RogelioJTaylor@teleworm.us</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" patient_id assigned_sex given_name surname address city \\\n",
"131 132 female Sandra Taylor 2476 Fulton Street Rainelle \n",
"426 427 male Rogelio Taylor 4064 Marigold Lane Miami \n",
"\n",
" state zip_code country birthdate weight height bmi \\\n",
"131 WV 25962 United States 1960-10-23 206.1 64 35.4 \n",
"426 FL 33179 United States 1992-09-02 186.6 69 27.6 \n",
"\n",
" phone_number email \n",
"131 13044382648 SandraCTaylor@dayrep.com \n",
"426 13054346299 RogelioJTaylor@teleworm.us "
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"patients_clean[patients_clean.surname == 'Taylor']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### kgs instead of lbs for Zaitseva weight"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Define\n",
"Use [advanced indexing](https://stackoverflow.com/a/44913631) to isolate the row where the surname is Zaitseva and convert the entry in its weight field from kg to lbs."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Code"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"weight_kg = patients_clean.weight.min()\n",
"mask = patients_clean.surname == 'Zaitseva'\n",
"column_name = 'weight'\n",
"patients_clean.loc[mask, column_name] = weight_kg * 2.20462"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"##### Test"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"459 102.100000\n",
"335 102.700000\n",
"74 103.200000\n",
"317 106.000000\n",
"171 106.500000\n",
"51 107.100000\n",
"210 107.585456\n",
"270 108.100000\n",
"198 108.500000\n",
"48 109.100000\n",
"478 109.600000\n",
"141 110.200000\n",
"38 111.800000\n",
"438 112.000000\n",
"14 112.000000\n",
"235 112.200000\n",
"307 112.400000\n",
"191 112.600000\n",
"408 113.100000\n",
"49 113.300000\n",
"326 114.000000\n",
"338 114.100000\n",
"253 117.000000\n",
"321 118.400000\n",
"168 118.800000\n",
"1 118.800000\n",
"350 119.000000\n",
"207 119.200000\n",
"265 120.000000\n",
"341 120.300000\n",
" ... \n",
"332 224.000000\n",
"12 224.200000\n",
"252 224.200000\n",
"222 224.800000\n",
"166 225.300000\n",
"111 225.900000\n",
"101 226.200000\n",
"150 226.600000\n",
"88 227.700000\n",
"352 227.700000\n",
"428 227.700000\n",
"13 228.400000\n",
"339 229.000000\n",
"182 230.300000\n",
"121 230.800000\n",
"257 231.700000\n",
"395 231.900000\n",
"246 232.100000\n",
"219 237.800000\n",
"11 238.700000\n",
"50 238.900000\n",
"441 239.100000\n",
"499 239.600000\n",
"439 242.000000\n",
"487 242.400000\n",
"144 244.900000\n",
"61 244.900000\n",
"283 245.500000\n",
"118 254.500000\n",
"485 255.900000\n",
"Name: weight, Length: 494, dtype: float64"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 48.8 shouldn't be the lowest anymore\n",
"patients_clean.weight.sort_values()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment