@fonnesbeck
Created December 1, 2017 16:54
Dataset Processing.ipynb
{
"cells": [
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Import modules and set options\n%matplotlib inline\nimport matplotlib.pyplot as plt\nimport pandas as pd\nimport numpy as np\nimport seaborn as sns",
"execution_count": 1,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "DATA_DIR = '../data/clean/'",
"execution_count": 2,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Connect to the database to import data for the three test domains and demographic information:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "from redcap import Project\napi_url = 'https://redcap.vanderbilt.edu/api/'\napi_key = open(\"/Users/fonnescj/Dropbox/Collaborations/LSL-DR/api_token.txt\").read()\n\nlsl_dr_project = Project(api_url, api_key)",
"execution_count": 3,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "metadata = lsl_dr_project.export_metadata()",
"execution_count": 4,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Import each database from REDCap:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "articulation_fields = ['study_id','redcap_event_name', 'age_test_aaps','aaps_ss','age_test_gf2','gf2_ss']\narticulation = lsl_dr_project.export_records(fields=articulation_fields, format='df', df_kwargs={'index_col':None,\n 'na_values':[999, 9999]})",
"execution_count": 5,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "records = lsl_dr_project.export_records(fields=articulation_fields)",
"execution_count": 6,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print(records[0]['study_id'])",
"execution_count": 7,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "0101-2002-0101\n"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "expressive_fields = ['study_id','redcap_event_name','age_test_eowpvt','eowpvt_ss','age_test_evt','evt_ss']\nexpressive = lsl_dr_project.export_records(fields=expressive_fields, format='df', \n df_kwargs={'index_col':None,\n 'na_values':[999, 9999]})",
"execution_count": 8,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "receptive_fields = ['study_id','redcap_event_name','age_test_ppvt','ppvt_ss','age_test_rowpvt','rowpvt_ss']\nreceptive = lsl_dr_project.export_records(fields=receptive_fields, format='df', \n df_kwargs={'index_col':None,\n 'na_values':[999, 9999]})",
"execution_count": 9,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "language_fields = ['study_id','redcap_event_name','pls_ac_ss','pls_ec_ss','pls_choice','age_test_pls',\n 'owls_lc_ss','owls_oe_ss','age_test_owls',\n 'celfp_rl_ss','celfp_el_ss','age_test_celp',\n 'celf_elss','celf_rlss','age_test_celf',\n 'celfp_ss_ss', 'celfp_ws_ss', 'celfp_ev_ss', 'celfp_fd_ss',\n 'celfp_rs_ss', 'celfp_bc_ss', 'celfp_wcr_ss', 'celfp_wce_ss',\n 'celfp_wct_ss']\nlanguage_raw = lsl_dr_project.export_records(fields=language_fields, format='df', \n df_kwargs={'index_col':None, \n 'na_values':[999, 9999]})",
"execution_count": 10,
"outputs": []
},
{
"metadata": {
"scrolled": true,
"trusted": true
},
"cell_type": "code",
"source": "demographic_fields = ['study_id','redcap_event_name','redcap_data_access_group', 'academic_year_rv',\n'hl','prim_lang','mother_ed','father_ed','premature_age', 'synd_cause', 'age_disenrolled', 'race',\n'onset_1','age_int','age','age_amp', 'age_ci', 'age_ci_2', 'degree_hl_ad','type_hl_ad','tech_ad','degree_hl_as',\n'type_hl_as','tech_as','etiology','etiology_2', 'sib', 'gender', 'time', 'ad_250', 'as_250', 'ae',\n'ad_500', 'as_500', 'fam_age', 'family_inv', 'demo_ses', 'school_lunch', 'medicaid', 'hearing_changes',\n'slc_fo', 'sle_fo', 'a_fo', 'funct_out_age', 'parent_hl', 'med_cause', 'known_synd', 'school_grade',\n'att_days_hr', 'att_days_sch', 'att_days_st2_417', 'optionserv_type', 'option_pop', 'otherserv']\ndemographic_raw = lsl_dr_project.export_records(fields=demographic_fields, format='df', \n df_kwargs={'index_col':None, \n 'na_values':[888, 999, 9999]})",
"execution_count": 11,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": "/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/IPython/core/interactiveshell.py:2802: DtypeWarning: Columns (27,33) have mixed types. Specify dtype option on import or set low_memory=False.\n if self.run_code(code, result):\n"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Attendance information"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Several fields in the demographic data have missing values."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic_raw.head()",
"execution_count": 12,
"outputs": [
{
"data": {
"text/html": "<div>\n<style>\n .dataframe thead tr:only-child th {\n text-align: right;\n }\n\n .dataframe thead th {\n text-align: left;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>study_id</th>\n <th>redcap_event_name</th>\n <th>academic_year_rv</th>\n <th>hl</th>\n <th>gender</th>\n <th>race</th>\n <th>prim_lang</th>\n <th>sib</th>\n <th>mother_ed</th>\n <th>father_ed</th>\n <th>...</th>\n <th>sle_fo</th>\n <th>a_fo</th>\n <th>fam_age</th>\n <th>family_inv</th>\n <th>att_days_sch</th>\n <th>att_days_st2_417</th>\n <th>att_days_hr</th>\n <th>demo_ses</th>\n <th>school_lunch</th>\n <th>medicaid</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0101-2002-0101</td>\n <td>initial_assessment_arm_1</td>\n <td>2002.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>6.0</td>\n <td>6.0</td>\n <td>...</td>\n <td>2.0</td>\n <td>2.0</td>\n <td>54.0</td>\n <td>2.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>1</th>\n <td>0101-2002-0101</td>\n <td>year_1_complete_71_arm_1</td>\n <td>2003.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>4.0</td>\n <td>4.0</td>\n <td>80.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0101-2002-0101</td>\n <td>year_2_complete_71_arm_1</td>\n <td>2004.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>4.0</td>\n <td>4.0</td>\n <td>80.0</td>\n <td>2.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>3</th>\n <td>0101-2002-0101</td>\n <td>year_3_complete_71_arm_1</td>\n 
<td>2005.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>5.0</td>\n <td>5.0</td>\n <td>96.0</td>\n <td>3.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n <tr>\n <th>4</th>\n <td>0101-2002-0101</td>\n <td>year_4_complete_71_arm_1</td>\n <td>2006.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>...</td>\n <td>5.0</td>\n <td>5.0</td>\n <td>109.0</td>\n <td>2.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 56 columns</p>\n</div>",
"text/plain": " study_id redcap_event_name academic_year_rv hl gender \\\n0 0101-2002-0101 initial_assessment_arm_1 2002.0 0.0 0.0 \n1 0101-2002-0101 year_1_complete_71_arm_1 2003.0 0.0 NaN \n2 0101-2002-0101 year_2_complete_71_arm_1 2004.0 0.0 NaN \n3 0101-2002-0101 year_3_complete_71_arm_1 2005.0 0.0 NaN \n4 0101-2002-0101 year_4_complete_71_arm_1 2006.0 0.0 NaN \n\n race prim_lang sib mother_ed father_ed ... sle_fo a_fo \\\n0 0.0 0.0 1.0 6.0 6.0 ... 2.0 2.0 \n1 NaN NaN NaN NaN NaN ... 4.0 4.0 \n2 NaN NaN NaN NaN NaN ... 4.0 4.0 \n3 NaN NaN NaN NaN NaN ... 5.0 5.0 \n4 NaN NaN NaN NaN NaN ... 5.0 5.0 \n\n fam_age family_inv att_days_sch att_days_st2_417 att_days_hr demo_ses \\\n0 54.0 2.0 NaN NaN NaN NaN \n1 80.0 1.0 NaN NaN NaN NaN \n2 80.0 2.0 NaN NaN NaN NaN \n3 96.0 3.0 NaN NaN NaN NaN \n4 109.0 2.0 NaN NaN NaN NaN \n\n school_lunch medicaid \n0 NaN NaN \n1 NaN NaN \n2 NaN NaN \n3 NaN NaN \n4 NaN NaN \n\n[5 rows x 56 columns]"
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "We can fill missing values forward from the previous observation within each `study_id`:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic = demographic_raw.sort_values(by='redcap_event_name').groupby('study_id').transform(\n lambda recs: recs.fillna(method='ffill'))#.reset_index()\ndemographic[\"study_id\"] = demographic_raw.sort_values(by='redcap_event_name').study_id",
"execution_count": 16,
"outputs": []
},
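{
"metadata": {},
"cell_type": "markdown",
"source": "As a minimal sketch of the per-group forward-fill logic on synthetic data (the toy values below are illustrative, not from LSL-DR):"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Illustrative check of per-group forward filling (synthetic data)\ntoy = pd.DataFrame({'study_id': ['a', 'a', 'b', 'b'],\n                    'race': [1.0, np.nan, 2.0, np.nan]})\n# Values propagate forward within each study_id, never across groups\ntoy.groupby('study_id').race.transform(lambda s: s.fillna(method='ffill'))",
"execution_count": null,
"outputs": []
},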
{
"metadata": {},
"cell_type": "markdown",
"source": "A random spot check to make sure this worked:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic[demographic.study_id=='1147-2010-0064']",
"execution_count": 17,
"outputs": [
{
"data": {
"text/html": "<div>\n<style>\n .dataframe thead tr:only-child th {\n text-align: right;\n }\n\n .dataframe thead th {\n text-align: left;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>redcap_event_name</th>\n <th>academic_year_rv</th>\n <th>hl</th>\n <th>gender</th>\n <th>race</th>\n <th>prim_lang</th>\n <th>sib</th>\n <th>mother_ed</th>\n <th>father_ed</th>\n <th>parent_hl</th>\n <th>...</th>\n <th>a_fo</th>\n <th>fam_age</th>\n <th>family_inv</th>\n <th>att_days_sch</th>\n <th>att_days_st2_417</th>\n <th>att_days_hr</th>\n <th>demo_ses</th>\n <th>school_lunch</th>\n <th>medicaid</th>\n <th>study_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>17762</th>\n <td>initial_assessment_arm_1</td>\n <td>2010.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>NaN</td>\n <td>...</td>\n <td>6.0</td>\n <td>65.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>1147-2010-0064</td>\n </tr>\n <tr>\n <th>17763</th>\n <td>year_1_complete_71_arm_1</td>\n <td>2011.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>NaN</td>\n <td>...</td>\n <td>5.0</td>\n <td>77.0</td>\n <td>2.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>1147-2010-0064</td>\n </tr>\n <tr>\n <th>17764</th>\n <td>year_2_complete_71_arm_1</td>\n <td>2012.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>NaN</td>\n <td>...</td>\n <td>5.0</td>\n <td>89.0</td>\n <td>2.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>1147-2010-0064</td>\n </tr>\n <tr>\n <th>17765</th>\n <td>year_3_complete_71_arm_1</td>\n 
<td>2013.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>NaN</td>\n <td>...</td>\n <td>5.0</td>\n <td>101.0</td>\n <td>2.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>1147-2010-0064</td>\n </tr>\n </tbody>\n</table>\n<p>4 rows × 56 columns</p>\n</div>",
"text/plain": " redcap_event_name academic_year_rv hl gender race \\\n17762 initial_assessment_arm_1 2010.0 0.0 0.0 0.0 \n17763 year_1_complete_71_arm_1 2011.0 0.0 0.0 0.0 \n17764 year_2_complete_71_arm_1 2012.0 0.0 0.0 0.0 \n17765 year_3_complete_71_arm_1 2013.0 0.0 0.0 0.0 \n\n prim_lang sib mother_ed father_ed parent_hl ... a_fo \\\n17762 0.0 1.0 3.0 3.0 NaN ... 6.0 \n17763 0.0 1.0 3.0 3.0 NaN ... 5.0 \n17764 0.0 1.0 3.0 3.0 NaN ... 5.0 \n17765 0.0 1.0 3.0 3.0 NaN ... 5.0 \n\n fam_age family_inv att_days_sch att_days_st2_417 att_days_hr \\\n17762 65.0 0.0 NaN NaN NaN \n17763 77.0 2.0 NaN NaN NaN \n17764 89.0 2.0 NaN NaN NaN \n17765 101.0 2.0 NaN NaN NaN \n\n demo_ses school_lunch medicaid study_id \n17762 NaN NaN NaN 1147-2010-0064 \n17763 NaN NaN NaN 1147-2010-0064 \n17764 NaN NaN NaN 1147-2010-0064 \n17765 NaN NaN NaN 1147-2010-0064 \n\n[4 rows x 56 columns]"
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Demographic data after forward-filling missing values:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.head()",
"execution_count": 18,
"outputs": [
{
"data": {
"text/html": "<div>\n<style>\n .dataframe thead tr:only-child th {\n text-align: right;\n }\n\n .dataframe thead th {\n text-align: left;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>redcap_event_name</th>\n <th>academic_year_rv</th>\n <th>hl</th>\n <th>gender</th>\n <th>race</th>\n <th>prim_lang</th>\n <th>sib</th>\n <th>mother_ed</th>\n <th>father_ed</th>\n <th>parent_hl</th>\n <th>...</th>\n <th>a_fo</th>\n <th>fam_age</th>\n <th>family_inv</th>\n <th>att_days_sch</th>\n <th>att_days_st2_417</th>\n <th>att_days_hr</th>\n <th>demo_ses</th>\n <th>school_lunch</th>\n <th>medicaid</th>\n <th>study_id</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>initial_assessment_arm_1</td>\n <td>2002.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>6.0</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>2.0</td>\n <td>54.0</td>\n <td>2.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0101-2002-0101</td>\n </tr>\n <tr>\n <th>9458</th>\n <td>initial_assessment_arm_1</td>\n <td>2009.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>3.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>5.0</td>\n <td>135.0</td>\n <td>1.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0628-1999-0506</td>\n </tr>\n <tr>\n <th>9463</th>\n <td>initial_assessment_arm_1</td>\n <td>2009.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>4.0</td>\n <td>175.0</td>\n <td>3.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0628-2000-0308</td>\n </tr>\n <tr>\n <th>9469</th>\n <td>initial_assessment_arm_1</td>\n <td>2009.0</td>\n 
<td>0.0</td>\n <td>0.0</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>3.0</td>\n <td>5.0</td>\n <td>NaN</td>\n <td>...</td>\n <td>6.0</td>\n <td>100.0</td>\n <td>0.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0628-2001-0034</td>\n </tr>\n <tr>\n <th>9473</th>\n <td>initial_assessment_arm_1</td>\n <td>2009.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>2.0</td>\n <td>2.0</td>\n <td>2.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>4.0</td>\n <td>154.0</td>\n <td>4.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0628-2001-0289</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 56 columns</p>\n</div>",
"text/plain": " redcap_event_name academic_year_rv hl gender race \\\n0 initial_assessment_arm_1 2002.0 0.0 0.0 0.0 \n9458 initial_assessment_arm_1 2009.0 0.0 1.0 2.0 \n9463 initial_assessment_arm_1 2009.0 0.0 0.0 2.0 \n9469 initial_assessment_arm_1 2009.0 0.0 0.0 6.0 \n9473 initial_assessment_arm_1 2009.0 0.0 0.0 2.0 \n\n prim_lang sib mother_ed father_ed parent_hl ... a_fo \\\n0 0.0 1.0 6.0 6.0 0.0 ... 2.0 \n9458 0.0 1.0 3.0 3.0 0.0 ... 5.0 \n9463 0.0 1.0 2.0 0.0 0.0 ... 4.0 \n9469 0.0 1.0 3.0 5.0 NaN ... 6.0 \n9473 0.0 2.0 2.0 2.0 0.0 ... 4.0 \n\n fam_age family_inv att_days_sch att_days_st2_417 att_days_hr \\\n0 54.0 2.0 NaN NaN NaN \n9458 135.0 1.0 NaN NaN NaN \n9463 175.0 3.0 NaN NaN NaN \n9469 100.0 0.0 NaN NaN NaN \n9473 154.0 4.0 NaN NaN NaN \n\n demo_ses school_lunch medicaid study_id \n0 NaN NaN NaN 0101-2002-0101 \n9458 NaN NaN NaN 0628-1999-0506 \n9463 NaN NaN NaN 0628-2000-0308 \n9469 NaN NaN NaN 0628-2001-0034 \n9473 NaN NaN NaN 0628-2001-0289 \n\n[5 rows x 56 columns]"
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Cleaning language dataset\n\n5 language measures:\n\n- 3 versions of CELF\n- PLS\n - pls_ac_rs: \tPLS: Auditory Comprehension Raw Score\n - pls_ac_ss: \tPLS: Auditory Comprehension Standard Score\n - pls_ec_rs: \tPLS: Expressive Communication Raw Score\n - pls_ec_ss: \tPLS: Expressive Communication Standard Score\n - pls_tl_rs: \tPLS: Total Language Score Standard Score Total\n - pls_tl_ss: \tPLS: Total Language Score Standard Score\n- OWLS\n - age_test_owls: \tAge at time of testing (OWLS)\n - owls_lc_rs: \tOWLS: Listening Comprehension Raw Score\n - owls_lc_ss: \tOWLS: Listening Comprehension Standard Score\n - owls_oe_rs: \tOWLS: Oral Expression Raw Score\n - owls_oe_ss: \tOWLS: Oral Expression Standard Score\n - owls_oc_sss: \tOWLS: Oral Composite Sum of Listening Comprehension and Oral Expression Standard Scores\n - owls_oc_ss: \tOWLS: Oral Composite Standard Score\n - owls_wes_trs: \tOWLS: Written Expression Scale Total Raw Score\n - owls_wes_as: \tOWLS: Written Expression Scale Ability Score\n - owls_wes_ss: \tOWLS: Written Expression Scale Standard Score\n - owsl_lc: \tOWLS: Written Expression Scale Language Composite (Sum of written expression age-based standard score, listening comprehension standard score and oral expression standard score)\n - owls_lcss: \tOWLS: Language Composite Standard Score"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Test type\nlanguage_raw[\"test_name\"] = None\nlanguage_raw[\"test_type\"] = None\nlanguage_raw[\"score\"] = None\nCELP = language_raw.age_test_celp.notnull()\nCELF = language_raw.age_test_celf.notnull()\nPLS = language_raw.age_test_pls.notnull()\nOWLS = language_raw.age_test_owls.notnull()\n\nlanguage_raw['age_test'] = None\nlanguage_raw.loc[CELP, 'age_test'] = language_raw.age_test_celp\nlanguage_raw.loc[CELF, 'age_test'] = language_raw.age_test_celf\nlanguage_raw.loc[PLS, 'age_test'] = language_raw.age_test_pls\nlanguage_raw.loc[OWLS, 'age_test'] = language_raw.age_test_owls\n\nlanguage1 = language_raw[CELP | CELF | PLS | OWLS].copy()\nlanguage2 = language1.copy()\n\nlanguage1[\"test_type\"] = \"receptive\"\n\nlanguage1.loc[CELP, \"test_name\"] = \"CELF-P2\"\nlanguage1.loc[CELF, \"test_name\"] = \"CELF-4\"\nlanguage1.loc[PLS, \"test_name\"] = \"PLS\"\nlanguage1.loc[OWLS, \"test_name\"] = \"OWLS\"\n\nlanguage1.loc[CELP, \"score\"] = language1.celfp_rl_ss\nlanguage1.loc[CELF, \"score\"] = language1.celf_rlss\nlanguage1.loc[PLS, \"score\"] = language1.pls_ac_ss\nlanguage1.loc[OWLS, \"score\"] = language1.owls_lc_ss\n\n\nlanguage2[\"test_type\"] = \"expressive\"\n\nlanguage2.loc[CELP, \"test_name\"] = \"CELF-P2\"\nlanguage2.loc[CELF, \"test_name\"] = \"CELF-4\"\nlanguage2.loc[PLS, \"test_name\"] = \"PLS\"\nlanguage2.loc[OWLS, \"test_name\"] = \"OWLS\"\n\nlanguage2.loc[CELP, \"score\"] = language1.celfp_el_ss\nlanguage2.loc[CELF, \"score\"] = language1.celf_elss\nlanguage2.loc[PLS, \"score\"] = language1.pls_ec_ss\nlanguage2.loc[OWLS, \"score\"] = language1.owls_oe_ss\n\nlanguage = pd.concat([language1, language2])\nlanguage = language[language.score.notnull()]\nprint(pd.crosstab(language.test_name, language.test_type))\nprint(\"There are {0} null values for score\".format(sum(language[\"score\"].isnull())))",
"execution_count": 19,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "test_type expressive receptive\ntest_name \nCELF-4 682 584\nCELF-P2 1927 1933\nOWLS 1370 1376\nPLS 4603 4613\nThere are 0 null values for score\n"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "A `school` variable was added, which is the first four characters of the `study_id`:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "language[\"school\"] = language.study_id.str.slice(0,4)",
"execution_count": 20,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "language_subtest = language[[\"study_id\", \"redcap_event_name\", \"score\", \"test_type\", \n \"test_name\", \"school\", \"age_test\", \n 'celfp_ss_ss', 'celfp_ws_ss', \n 'celfp_ev_ss', 'celfp_fd_ss',\n 'celfp_rs_ss', 'celfp_bc_ss', \n 'celfp_wcr_ss', 'celfp_wce_ss',\n 'celfp_wct_ss']]",
"execution_count": 21,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "language = language[[\"study_id\", \"redcap_event_name\", \"score\", \"test_type\", \"test_name\", \"school\", \"age_test\"]]\nlanguage[\"domain\"] = \"Language\"\nlanguage.head()",
"execution_count": 22,
"outputs": [
{
"data": {
"text/html": "<div>\n<style>\n .dataframe thead tr:only-child th {\n text-align: right;\n }\n\n .dataframe thead th {\n text-align: left;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>study_id</th>\n <th>redcap_event_name</th>\n <th>score</th>\n <th>test_type</th>\n <th>test_name</th>\n <th>school</th>\n <th>age_test</th>\n <th>domain</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0101-2002-0101</td>\n <td>initial_assessment_arm_1</td>\n <td>51</td>\n <td>receptive</td>\n <td>PLS</td>\n <td>0101</td>\n <td>54</td>\n <td>Language</td>\n </tr>\n <tr>\n <th>5</th>\n <td>0101-2002-0101</td>\n <td>year_5_complete_71_arm_1</td>\n <td>61</td>\n <td>receptive</td>\n <td>OWLS</td>\n <td>0101</td>\n <td>113</td>\n <td>Language</td>\n </tr>\n <tr>\n <th>9</th>\n <td>0101-2003-0102</td>\n <td>initial_assessment_arm_1</td>\n <td>55</td>\n <td>receptive</td>\n <td>PLS</td>\n <td>0101</td>\n <td>44</td>\n <td>Language</td>\n </tr>\n <tr>\n <th>10</th>\n <td>0101-2003-0102</td>\n <td>year_1_complete_71_arm_1</td>\n <td>77</td>\n <td>receptive</td>\n <td>PLS</td>\n <td>0101</td>\n <td>54</td>\n <td>Language</td>\n </tr>\n <tr>\n <th>11</th>\n <td>0101-2003-0102</td>\n <td>year_2_complete_71_arm_1</td>\n <td>93</td>\n <td>receptive</td>\n <td>CELF-P2</td>\n <td>0101</td>\n <td>68</td>\n <td>Language</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " study_id redcap_event_name score test_type test_name \\\n0 0101-2002-0101 initial_assessment_arm_1 51 receptive PLS \n5 0101-2002-0101 year_5_complete_71_arm_1 61 receptive OWLS \n9 0101-2003-0102 initial_assessment_arm_1 55 receptive PLS \n10 0101-2003-0102 year_1_complete_71_arm_1 77 receptive PLS \n11 0101-2003-0102 year_2_complete_71_arm_1 93 receptive CELF-P2 \n\n school age_test domain \n0 0101 54 Language \n5 0101 113 Language \n9 0101 44 Language \n10 0101 54 Language \n11 0101 68 Language "
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "language.to_csv(DATA_DIR+'language.csv')",
"execution_count": 23,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Cleaning articulation dataset\n\nWe converted the articulation dataset into a \"long\" format:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Test type\narticulation[\"test_type\"] = None\nARIZ = articulation.aaps_ss.notnull()\nGF = articulation.gf2_ss.notnull()\narticulation = articulation[ARIZ | GF]\narticulation.loc[(ARIZ & GF), \"test_type\"] = \"Arizonia and Goldman\"\narticulation.loc[(ARIZ & ~GF), \"test_type\"] = \"Arizonia\"\narticulation.loc[(~ARIZ & GF), \"test_type\"] = \"Goldman\"\n\nprint(articulation.test_type.value_counts())\nprint(\"There are {0} null values for test_type\".format(sum(articulation[\"test_type\"].isnull())))\n\n# Test score (Arizonia if both)\narticulation[\"score\"] = articulation.aaps_ss\narticulation.loc[(~ARIZ & GF), \"score\"] = articulation.gf2_ss[~ARIZ & GF]",
"execution_count": 24,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "Goldman 5863\nArizonia 563\nArizonia and Goldman 94\nName: test_type, dtype: int64\nThere are 0 null values for test_type\n"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "A `school` variable was added, which is the first four characters of the `study_id`:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "articulation[\"school\"] = articulation.study_id.str.slice(0,4)",
"execution_count": 25,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "The age was taken to be the Arizonia age if both test types are present:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "articulation[\"age_test\"] = articulation.age_test_aaps\narticulation.loc[articulation.age_test.isnull(), 'age_test'] = articulation.age_test_gf2[articulation.age_test.isnull()]\nprint(articulation.age_test.describe())",
"execution_count": 26,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "count 6518.000000\nmean 67.943081\nstd 29.729030\nmin 23.000000\n25% 47.000000\n50% 60.000000\n75% 80.000000\nmax 243.000000\nName: age_test, dtype: float64\n"
}
]
},
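{
"metadata": {},
"cell_type": "markdown",
"source": "An equivalent, more compact way to express this preference (a sketch, not the cell above) is `combine_first`, which keeps the Arizonia age where available and falls back to the Goldman age:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Sketch: prefer the Arizonia test age, fall back to the Goldman age\n# (equivalent to the fillna-style assignment above)\narticulation['age_test'] = articulation.age_test_aaps.combine_first(articulation.age_test_gf2)",
"execution_count": null,
"outputs": []
},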
{
"metadata": {},
"cell_type": "markdown",
"source": "Finally, we dropped unwanted columns and added a domain identification column for merging:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# articulation = articulation.drop([\"age_test_aaps\", \"age_test_gf2\", \"aaps_ss\", \"gf2_ss\"], axis=1)\narticulation[\"domain\"] = \"Articulation\"\narticulation.head()",
"execution_count": 27,
"outputs": [
{
"data": {
"text/html": "<div>\n<style>\n .dataframe thead tr:only-child th {\n text-align: right;\n }\n\n .dataframe thead th {\n text-align: left;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>study_id</th>\n <th>redcap_event_name</th>\n <th>age_test_aaps</th>\n <th>aaps_ss</th>\n <th>age_test_gf2</th>\n <th>gf2_ss</th>\n <th>test_type</th>\n <th>score</th>\n <th>school</th>\n <th>age_test</th>\n <th>domain</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>1</th>\n <td>0101-2002-0101</td>\n <td>year_1_complete_71_arm_1</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>80.0</td>\n <td>78.0</td>\n <td>Goldman</td>\n <td>78.0</td>\n <td>0101</td>\n <td>80.0</td>\n <td>Articulation</td>\n </tr>\n <tr>\n <th>9</th>\n <td>0101-2003-0102</td>\n <td>initial_assessment_arm_1</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>44.0</td>\n <td>72.0</td>\n <td>Goldman</td>\n <td>72.0</td>\n <td>0101</td>\n <td>44.0</td>\n <td>Articulation</td>\n </tr>\n <tr>\n <th>10</th>\n <td>0101-2003-0102</td>\n <td>year_1_complete_71_arm_1</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>54.0</td>\n <td>97.0</td>\n <td>Goldman</td>\n <td>97.0</td>\n <td>0101</td>\n <td>54.0</td>\n <td>Articulation</td>\n </tr>\n <tr>\n <th>14</th>\n <td>0101-2004-0101</td>\n <td>year_2_complete_71_arm_1</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>53.0</td>\n <td>75.0</td>\n <td>Goldman</td>\n <td>75.0</td>\n <td>0101</td>\n <td>53.0</td>\n <td>Articulation</td>\n </tr>\n <tr>\n <th>15</th>\n <td>0101-2004-0101</td>\n <td>year_3_complete_71_arm_1</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>66.0</td>\n <td>80.0</td>\n <td>Goldman</td>\n <td>80.0</td>\n <td>0101</td>\n <td>66.0</td>\n <td>Articulation</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " study_id redcap_event_name age_test_aaps aaps_ss \\\n1 0101-2002-0101 year_1_complete_71_arm_1 NaN NaN \n9 0101-2003-0102 initial_assessment_arm_1 NaN NaN \n10 0101-2003-0102 year_1_complete_71_arm_1 NaN NaN \n14 0101-2004-0101 year_2_complete_71_arm_1 NaN NaN \n15 0101-2004-0101 year_3_complete_71_arm_1 NaN NaN \n\n age_test_gf2 gf2_ss test_type score school age_test domain \n1 80.0 78.0 Goldman 78.0 0101 80.0 Articulation \n9 44.0 72.0 Goldman 72.0 0101 44.0 Articulation \n10 54.0 97.0 Goldman 97.0 0101 54.0 Articulation \n14 53.0 75.0 Goldman 75.0 0101 53.0 Articulation \n15 66.0 80.0 Goldman 80.0 0101 66.0 Articulation "
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "articulation.to_csv(DATA_DIR+'articulation.csv')",
"execution_count": 28,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Cleaning demographic dataset\n\nWe excluded unwanted columns and rows for which age, gender or race were missing:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Retain only subset of columns\n#demographic = demographic[demographic.gender.notnull()]\ndemographic = demographic.rename(columns={'gender':'male'})",
"execution_count": 29,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Due to sample size considerations, we reduced the non-English primary language variable to English (0) and non-English (1):"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic[\"non_english\"] = None\ndemographic.loc[demographic.prim_lang.notnull(), 'non_english'] = demographic.prim_lang[demographic.prim_lang.notnull()]>0\nprint(demographic.non_english.value_counts())\nprint(\"There are {0} null values for non_english\".format(sum(demographic.non_english.isnull())))",
"execution_count": 30,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "False 14619\nTrue 3431\nName: non_english, dtype: int64\nThere are 501 null values for non_english\n"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Mother's education (`mother_ed`) and father's education (`father_ed`) were both recoded to: \n\n* 0=no high school diploma\n* 1=high school\n* 2=undergraduate\n* 3=graduate\n\nCategory 6 (unknown) was recoded as missing."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic = demographic.rename(columns={\"mother_ed\":\"_mother_ed\"})\ndemographic[\"mother_ed\"] = demographic._mother_ed.copy()\ndemographic.loc[demographic._mother_ed==1, 'mother_ed'] = 0\ndemographic.loc[(demographic._mother_ed==2) | (demographic.mother_ed==3), 'mother_ed'] = 1\ndemographic.loc[demographic._mother_ed==4, 'mother_ed'] = 2\ndemographic.loc[demographic._mother_ed==5, 'mother_ed'] = 3\ndemographic.loc[demographic._mother_ed==6, 'mother_ed'] = None\nprint(\"_mother_ed:\")\nprint(demographic._mother_ed.value_counts())\nprint(\"mother_ed:\")\nprint(demographic.mother_ed.value_counts())\nprint(\"\\nThere are {0} null values for mother_ed\".format(sum(demographic.mother_ed.isnull())))",
"execution_count": 31,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "_mother_ed:\n6.0 6008\n4.0 4088\n3.0 2762\n5.0 2228\n2.0 1977\n1.0 605\n0.0 273\nName: _mother_ed, dtype: int64\nmother_ed:\n1.0 4739\n2.0 4088\n3.0 2228\n0.0 878\nName: mother_ed, dtype: int64\n\nThere are 6618 null values for mother_ed\n"
}
]
},
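{
"metadata": {},
"cell_type": "markdown",
"source": "The same recode can be sketched more compactly with a mapping dict (an illustrative alternative, not the cell above):"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Hypothetical alternative: map raw education codes straight to the collapsed scale\n# (category 6 = unknown becomes missing)\ned_recode = {0: 0, 1: 0, 2: 1, 3: 1, 4: 2, 5: 3, 6: np.nan}\ndemographic['mother_ed'] = demographic._mother_ed.map(ed_recode)",
"execution_count": null,
"outputs": []
},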
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "parent_hl_lookup = {0: \"Both parents do not have a hearing loss\",\n 1: \"Both parents have hearing loss\",\n 2: \"Mother has hearing loss\",\n 3: \"Father has hearing loss\",\n 4: \"Unknown\"}",
"execution_count": 32,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic['parent_hearing_loss'] = demographic.parent_hl.replace(parent_hl_lookup)",
"execution_count": 33,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Secondary diagnosis"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.shape",
"execution_count": 34,
"outputs": [
{
"data": {
"text/plain": "(18551, 59)"
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic['secondary_diagnosis'] = demographic.etiology==0\n# Suspected or unknown treated as missing\ndemographic.loc[demographic.etiology > 1, 'secondary_diagnosis'] = None",
"execution_count": 35,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.secondary_diagnosis.value_counts()",
"execution_count": 36,
"outputs": [
{
"data": {
"text/plain": "0.0 13659\n1.0 2960\nName: secondary_diagnosis, dtype: int64"
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.secondary_diagnosis.mean()",
"execution_count": 37,
"outputs": [
{
"data": {
"text/plain": "0.17810939286358987"
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Premature status was recoded to True (premature) and False (full-term). Here, premature indicates <36 weeks."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic['premature_weeks'] = demographic.premature_age.copy()\ndemographic.loc[demographic.premature_age==9, 'premature_weeks'] = None\ndemographic.premature_weeks = abs(demographic.premature_weeks-8)*2\nprint(\"There are {0} null values for premature_weeks\".format(sum(demographic.premature_weeks.isnull())))",
"execution_count": 38,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "There are 3497 null values for premature_weeks\n"
}
]
},
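{
"metadata": {},
"cell_type": "markdown",
"source": "The transform above implies that `premature_age` is coded on a 0\u20138 scale where 8 means full term and each step below 8 represents two additional weeks early (an interpretation inferred from the arithmetic, not from the codebook):"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Code 8 (full term) -> 0 weeks early; code 1 -> 14 weeks early; code 0 -> 16\n{code: abs(code - 8) * 2 for code in range(9)}",
"execution_count": null,
"outputs": []
},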
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.premature_weeks.value_counts()",
"execution_count": 39,
"outputs": [
{
"data": {
"text/plain": "0.0 12994\n2.0 754\n4.0 472\n12.0 229\n6.0 211\n10.0 195\n8.0 147\n14.0 47\n16.0 5\nName: premature_weeks, dtype: int64"
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Recode impant technology variables for each ear to one of four categories (None, Baha, Hearing aid, Cochlear implant):"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.tech_ad.value_counts()",
"execution_count": 40,
"outputs": [
{
"data": {
"text/plain": "1.0 6460\n0.0 5461\n7.0 2040\n5.0 1279\n2.0 727\n6.0 503\n8.0 100\n9.0 86\n4.0 42\n3.0 28\n10.0 11\nName: tech_ad, dtype: int64"
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "tech_cats = [\"None\", \"OAD\", \"Hearing aid\", \"Cochlear\", \"Other\"]\n\ndemographic[\"tech_right\"] = 4\ndemographic.loc[demographic.tech_ad==7, 'tech_right'] = 0\ndemographic.loc[demographic.tech_ad==3, 'tech_right'] = 1\ndemographic.loc[demographic.tech_ad.isin([1,2,4,5,10]), 'tech_right'] = 2\ndemographic.loc[demographic.tech_ad.isin([0,8,6]), 'tech_right'] = 3\ndemographic.loc[demographic.tech_ad.isnull(), 'tech_right'] = None\n\ndemographic[\"tech_left\"] = 4\ndemographic.loc[demographic.tech_as==7, 'tech_left'] = 0\ndemographic.loc[demographic.tech_as==3, 'tech_left'] = 1\ndemographic.loc[demographic.tech_as.isin([1,2,4,5,10]), 'tech_left'] = 2\ndemographic.loc[demographic.tech_as.isin([0,8,6]), 'tech_left'] = 3\ndemographic.loc[demographic.tech_as.isnull(), 'tech_left'] = None",
"execution_count": 41,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.tech_left.value_counts()",
"execution_count": 42,
"outputs": [
{
"data": {
"text/plain": "2.0 8656\n3.0 5542\n0.0 2412\n4.0 68\n1.0 21\nName: tech_left, dtype: int64"
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.tech_right.value_counts()",
"execution_count": 43,
"outputs": [
{
"data": {
"text/plain": "2.0 8519\n3.0 6064\n0.0 2040\n4.0 86\n1.0 28\nName: tech_right, dtype: int64"
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Substitute valid missing values for hearing loss:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.loc[demographic.type_hl_ad==5, 'type_hl_ad'] = None\ndemographic.loc[demographic.type_hl_as==5, 'type_hl_ad'] = None",
"execution_count": 44,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Create `degree_hl`, which is the maximum level of hearing loss in either ear:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic[\"degree_hl\"] = np.maximum(demographic.degree_hl_ad, demographic.degree_hl_as)",
"execution_count": 45,
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": "/Users/fonnescj/anaconda3/envs/dev/lib/python3.6/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in maximum\n \"\"\"Entry point for launching an IPython kernel.\n"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Create compound indicator variable for each technology (Baha, Hearing aid, Chochlear implant): \n\n* 0=none\n* 1=one ear\n* 2=both ears."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic[\"oad\"] = 0\ndemographic.oad = demographic.oad.astype(object)\ndemographic.loc[(demographic.tech_right==1) | (demographic.tech_left==1), 'oad'] = 1\ndemographic.loc[(demographic.tech_right==1) & (demographic.tech_left==1), 'oad'] = 2\ndemographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'oad'] = None\nprint(\"oad:\")\nprint(demographic.drop_duplicates(subset='study_id').oad.value_counts())\nprint(\"There are {0} null values for OAD\".format(sum(demographic.oad.isnull())))\n\ndemographic[\"hearing_aid\"] = 0\ndemographic.hearing_aid = demographic.hearing_aid.astype(object)\ndemographic.loc[(demographic.tech_right==2) | (demographic.tech_left==2), 'hearing_aid'] = 1\ndemographic.loc[(demographic.tech_right==2) & (demographic.tech_left==2), 'hearing_aid'] = 2\ndemographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_right.isnull()), 'hearing_aid'] = None\nprint(\"\\nhearing_aid:\")\nprint(demographic.drop_duplicates(subset='study_id').hearing_aid.value_counts())\nprint(\"There are {0} null values for hearing_aid\".format(sum(demographic.hearing_aid.isnull())))\n\ndemographic[\"cochlear\"] = 0\ndemographic.cochlear = demographic.cochlear.astype(object)\ndemographic.loc[(demographic.tech_right==3) | (demographic.tech_left==3), 'cochlear'] = 1\ndemographic.loc[(demographic.tech_right==3) & (demographic.tech_left==3), 'cochlear'] = 2\ndemographic.loc[(demographic.tech_right.isnull()) & (demographic.tech_left.isnull()), 'cochlear'] = None\nprint(\"\\ncochlear:\")\nprint(demographic.drop_duplicates(subset='study_id').cochlear.value_counts())\nprint(\"There are {0} null values for cochlear\".format(sum(demographic.cochlear.isnull())))\nprint(len(demographic))",
"execution_count": 46,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "oad:\n0 5842\n1 4\n2 2\nName: oad, dtype: int64\nThere are 1764 null values for OAD\n\nhearing_aid:\n2 2771\n0 2022\n1 1024\nName: hearing_aid, dtype: int64\nThere are 1814 null values for hearing_aid\n\ncochlear:\n0 3993\n2 1135\n1 720\nName: cochlear, dtype: int64\nThere are 1764 null values for cochlear\n18551\n"
}
]
},
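{
"metadata": {},
"cell_type": "markdown",
"source": "The three indicator blocks above repeat one pattern: count the ears using a given technology, and treat the result as missing when the technology is unknown for both ears. A helper function makes the pattern explicit (a sketch; the `ear_count` name is hypothetical):"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "def ear_count(right, left, code):\n    # 0 = neither ear, 1 = one ear, 2 = both ears\n    count = (right == code).astype(int) + (left == code).astype(int)\n    count = count.astype(object)\n    # Missing when technology is unknown for both ears\n    count[right.isnull() & left.isnull()] = None\n    return count\n\n# e.g. cochlear implants: ear_count(demographic.tech_right, demographic.tech_left, 3)",
"execution_count": null,
"outputs": []
},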
{
"metadata": {},
"cell_type": "markdown",
"source": "Identify bilateral and bimodal individuals:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic[\"unilateral_ci\"] = demographic.cochlear==1\ndemographic[\"bilateral_ci\"] = demographic.cochlear==2\ndemographic[\"unilateral_ha\"] = demographic.hearing_aid==1\ndemographic[\"bilateral_ha\"] = demographic.hearing_aid==2\ndemographic[\"bimodal\"] = (demographic.cochlear==1) & (demographic.hearing_aid==1)",
"execution_count": 47,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.bilateral_ci.sum(), demographic.bilateral_ha.sum(), demographic.bimodal.sum(), demographic.unilateral_ci.sum()",
"execution_count": 48,
"outputs": [
{
"data": {
"text/plain": "(4561, 7014, 1711, 2484)"
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.drop_duplicates(subset='study_id')[['unilateral_ci','bilateral_ci', \n 'bilateral_ha',\n 'bimodal']].sum()",
"execution_count": 49,
"outputs": [
{
"data": {
"text/plain": "unilateral_ci 720\nbilateral_ci 1135\nbilateral_ha 2771\nbimodal 440\ndtype: int64"
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Create variable that identifies bilateral (0), bilateral HA left (1), bilateral HA right (2)"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic['tech'] = 0\ndemographic.loc[(demographic.bimodal) & (demographic.tech_left==2), 'tech'] = 1\ndemographic.loc[(demographic.bimodal) & (demographic.tech_right==2), 'tech'] = 2\nprint(\"There are {0} null values for tech\".format(sum(demographic.tech.isnull())))",
"execution_count": 50,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "There are 0 null values for tech\n"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic[\"implant_category\"] = None\ndemographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==0), \n 'implant_category'] = 0\ndemographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==0), \n 'implant_category'] = 1\ndemographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==1), \n 'implant_category'] = 2\ndemographic.loc[(demographic.cochlear==2) & (demographic.hearing_aid==0) & (demographic.oad==0), \n 'implant_category'] = 3\ndemographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==1) & (demographic.oad==0), \n 'implant_category'] = 4\ndemographic.loc[(demographic.cochlear==1) & (demographic.hearing_aid==0) & (demographic.oad==1), \n 'implant_category'] = 5\ndemographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==2) & (demographic.oad==0), \n 'implant_category'] = 6\ndemographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==1) & (demographic.oad==1), \n 'implant_category'] = 7\ndemographic.loc[(demographic.cochlear==0) & (demographic.hearing_aid==0) & (demographic.oad==2), \n 'implant_category'] = 8\ndemographic.implant_category.value_counts()",
"execution_count": 51,
"outputs": [
{
"data": {
"text/plain": "6 7014\n3 4561\n4 1711\n1 1411\n0 753\n8 16\n2 11\n7 5\n5 1\nName: implant_category, dtype: int64"
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
]
},
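{
"metadata": {},
"cell_type": "markdown",
"source": "The nine `.loc` conditions above amount to a lookup on the `(cochlear, hearing_aid, oad)` triple. The same assignment can be sketched with a dictionary (the `category_map` name and `implant_category_check` column are illustrative); unlisted combinations stay missing, as in the `.loc` version:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "category_map = {(1, 0, 0): 0, (0, 1, 0): 1, (0, 0, 1): 2,\n                (2, 0, 0): 3, (1, 1, 0): 4, (1, 0, 1): 5,\n                (0, 2, 0): 6, (0, 1, 1): 7, (0, 0, 2): 8}\n\n# .get returns None for any combination not in the map\ndemographic['implant_category_check'] = [category_map.get(key) for key in\n                                         zip(demographic.cochlear, demographic.hearing_aid, demographic.oad)]",
"execution_count": null,
"outputs": []
},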
{
"metadata": {},
"cell_type": "markdown",
"source": "**Age when hearing loss diagnosed** Data are entered inconsistently here, so we have to go in and replace non-numeric values."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Don't need this anymore\n# demographic['age_diag'] = demographic.onset_1.replace({'birth': 0, 'R- Birth L-16mo': 0, 'birth - 3': 0, 'at birth': 0, 'NBHS': 0, \n# 'at Birth': 0, '1-2': 1.5, '2-3': 2.5, '0-3': 1.5}).astype(float)\ndemographic['age_diag'] = demographic.onset_1",
"execution_count": 52,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Number of null values for `age_diag`"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.age_diag.isnull().sum()",
"execution_count": 53,
"outputs": [
{
"data": {
"text/plain": "3928"
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic['sex'] = demographic.male.replace({0:'Female', 1:'Male'})",
"execution_count": 54,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Child has another diagnosed disability"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic['known_synd'] = (demographic.synd_cause == 0)\n# Unknown or suspected\ndemographic.loc[demographic.synd_cause > 1, 'known_synd'] = None",
"execution_count": 55,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# If either known syndrome or secondary diagnosis\ndemographic['synd_or_disab'] = demographic.apply(lambda x: x['secondary_diagnosis'] or x['known_synd'], axis=1)",
"execution_count": 56,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Missing sibling counts were properly encoded as `None` (missing)."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.loc[demographic.sib==4, 'sib'] = None",
"execution_count": 57,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "We reduced the number of race categories, pooling those that were neither caucasian, black, hispanic or asian to \"other\", due to small sample sizes for these categories. Category 7 (unknown) was recoded as missing."
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "races = [\"Caucasian\", \"Black or African American\", \"Hispanic or Latino\", \"Asian\", \"Other\"]\ndemographic = demographic.rename(columns={\"race\":\"_race\"})\ndemographic[\"race\"] = demographic._race.copy()\ndemographic.loc[demographic.race==7, 'race'] = None\ndemographic.loc[demographic.race>3, 'race'] = 4\nprint(\"_race:\")\nprint(demographic._race.value_counts())\nprint(\"race:\")\nprint(demographic.race.value_counts())\nprint(\"There are {0} null values for race\".format(sum(demographic.race.isnull())))\n# Replace with recoded column",
"execution_count": 58,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "_race:\n0.0 9520\n2.0 3376\n1.0 1711\n3.0 1371\n6.0 980\n8.0 640\n7.0 316\n4.0 76\n5.0 52\nName: _race, dtype: int64\nrace:\n0.0 9520\n2.0 3376\n4.0 1748\n1.0 1711\n3.0 1371\nName: race, dtype: int64\nThere are 825 null values for race\n"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Recode implant technology variables"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "tech_cats = [\"None\", \"Baha\", \"Hearing aid\", \"Cochlear\", \"Other\"]\n\ndemographic[\"tech_right\"] = demographic.tech_ad.copy()\ndemographic.loc[demographic.tech_right==6, 'tech_right'] = 0\ndemographic.loc[demographic.tech_right==4, 'tech_right'] = 1\ndemographic.loc[demographic.tech_right==5, 'tech_right'] = 1\ndemographic.loc[demographic.tech_right==3, 'tech_right'] = 2\ndemographic.loc[demographic.tech_right==7, 'tech_right'] = 3\ndemographic.loc[demographic.tech_right==8, 'tech_right'] = 3\ndemographic.loc[demographic.tech_right==9, 'tech_right'] = 4\ndemographic.tech_right = np.abs(demographic.tech_right - 3)\n\ndemographic[\"tech_left\"] = demographic.tech_as.copy()\ndemographic.loc[demographic.tech_left==6, 'tech_left'] = 0\ndemographic.loc[demographic.tech_left==4, 'tech_left'] = 1\ndemographic.loc[demographic.tech_left==5, 'tech_left'] = 1\ndemographic.loc[demographic.tech_left==3, 'tech_left'] = 2\ndemographic.loc[demographic.tech_left==7, 'tech_left'] = 3\ndemographic.loc[demographic.tech_left==8, 'tech_left'] = 3\ndemographic.loc[demographic.tech_left==9, 'tech_left'] = 4\ndemographic.tech_left = np.abs(demographic.tech_left - 3)",
"execution_count": 59,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "demographic.to_csv(DATA_DIR+'demographics.csv')",
"execution_count": 60,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Cleaning expressive vocabulary dataset\n\nWe converted the expressive vocabulary dataset to \"long\" format:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Test type\nexpressive[\"test_type\"] = None\nEOWPVT = expressive.eowpvt_ss.notnull()\nEVT = expressive.evt_ss.notnull()\nexpressive = expressive[EOWPVT | EVT]\nexpressive.loc[EOWPVT & EVT, \"test_type\"] = \"EOWPVT and EVT\"\nexpressive.loc[EOWPVT & ~EVT, \"test_type\"] = \"EOWPVT\"\nexpressive.loc[~EOWPVT & EVT, \"test_type\"] = \"EVT\"\nprint(\"There are {0} null values for test_type\".format(sum(expressive[\"test_type\"].isnull())))\n\nexpressive[\"score\"] = expressive.eowpvt_ss\nexpressive.loc[~EOWPVT & EVT, \"score\"] = expressive.evt_ss[~EOWPVT & EVT]",
"execution_count": 61,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "There are 0 null values for test_type\n"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "expressive.test_type.value_counts()",
"execution_count": 62,
"outputs": [
{
"data": {
"text/plain": "EVT 4666\nEOWPVT 3290\nEOWPVT and EVT 205\nName: test_type, dtype: int64"
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "A `school` variable was added, which is the first four columns of the `study_id`:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "expressive[\"school\"] = expressive.study_id.str.slice(0,4)",
"execution_count": 63,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "The age was taken to be the EOWPVT age if there are both test types:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "expressive[\"age_test\"] = expressive.age_test_eowpvt\nexpressive.loc[expressive.age_test.isnull(), 'age_test'] = expressive.age_test_evt[expressive.age_test.isnull()]",
"execution_count": 64,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Finally, we dropped unwanted columns and added a domain identification column for merging:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# expressive = expressive[[\"study_id\", \"redcap_event_name\", \"score\", \"test_type\", \"school\", \"age_test\"]]\nexpressive[\"domain\"] = \"Expressive Vocabulary\"\nexpressive.head()",
"execution_count": 65,
"outputs": [
{
"data": {
"text/html": "<div>\n<style>\n .dataframe thead tr:only-child th {\n text-align: right;\n }\n\n .dataframe thead th {\n text-align: left;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>study_id</th>\n <th>redcap_event_name</th>\n <th>age_test_eowpvt</th>\n <th>eowpvt_ss</th>\n <th>age_test_evt</th>\n <th>evt_ss</th>\n <th>test_type</th>\n <th>score</th>\n <th>school</th>\n <th>age_test</th>\n <th>domain</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>0</th>\n <td>0101-2002-0101</td>\n <td>initial_assessment_arm_1</td>\n <td>54.0</td>\n <td>58.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>EOWPVT</td>\n <td>58.0</td>\n <td>0101</td>\n <td>54.0</td>\n <td>Expressive Vocabulary</td>\n </tr>\n <tr>\n <th>2</th>\n <td>0101-2002-0101</td>\n <td>year_2_complete_71_arm_1</td>\n <td>80.0</td>\n <td>84.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>EOWPVT</td>\n <td>84.0</td>\n <td>0101</td>\n <td>80.0</td>\n <td>Expressive Vocabulary</td>\n </tr>\n <tr>\n <th>5</th>\n <td>0101-2002-0101</td>\n <td>year_5_complete_71_arm_1</td>\n <td>113.0</td>\n <td>90.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>EOWPVT</td>\n <td>90.0</td>\n <td>0101</td>\n <td>113.0</td>\n <td>Expressive Vocabulary</td>\n </tr>\n <tr>\n <th>14</th>\n <td>0101-2004-0101</td>\n <td>year_2_complete_71_arm_1</td>\n <td>53.0</td>\n <td>90.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>EOWPVT</td>\n <td>90.0</td>\n <td>0101</td>\n <td>53.0</td>\n <td>Expressive Vocabulary</td>\n </tr>\n <tr>\n <th>15</th>\n <td>0101-2004-0101</td>\n <td>year_3_complete_71_arm_1</td>\n <td>66.0</td>\n <td>87.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>EOWPVT</td>\n <td>87.0</td>\n <td>0101</td>\n <td>66.0</td>\n <td>Expressive Vocabulary</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " study_id redcap_event_name age_test_eowpvt eowpvt_ss \\\n0 0101-2002-0101 initial_assessment_arm_1 54.0 58.0 \n2 0101-2002-0101 year_2_complete_71_arm_1 80.0 84.0 \n5 0101-2002-0101 year_5_complete_71_arm_1 113.0 90.0 \n14 0101-2004-0101 year_2_complete_71_arm_1 53.0 90.0 \n15 0101-2004-0101 year_3_complete_71_arm_1 66.0 87.0 \n\n age_test_evt evt_ss test_type score school age_test \\\n0 NaN NaN EOWPVT 58.0 0101 54.0 \n2 NaN NaN EOWPVT 84.0 0101 80.0 \n5 NaN NaN EOWPVT 90.0 0101 113.0 \n14 NaN NaN EOWPVT 90.0 0101 53.0 \n15 NaN NaN EOWPVT 87.0 0101 66.0 \n\n domain \n0 Expressive Vocabulary \n2 Expressive Vocabulary \n5 Expressive Vocabulary \n14 Expressive Vocabulary \n15 Expressive Vocabulary "
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "expressive.to_csv(DATA_DIR+'expressive_vocabulary.csv')",
"execution_count": 66,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Cleaning receptive vocabulary dataset"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "We converted the receptive vocabulary data table to \"long\" format:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Test type\nreceptive[\"test_type\"] = None\nPPVT = receptive.ppvt_ss.notnull()\nROWPVT = receptive.rowpvt_ss.notnull()\nreceptive = receptive[PPVT | ROWPVT]\nreceptive.loc[PPVT & ROWPVT, \"test_type\"] = \"PPVT and ROWPVT\"\nreceptive.loc[PPVT & ~ROWPVT, \"test_type\"] = \"PPVT\"\nreceptive.loc[~PPVT & ROWPVT, \"test_type\"] = \"ROWPVT\"\nprint(\"There are {0} null values for test_type\".format(sum(receptive[\"test_type\"].isnull())))\n\nreceptive[\"score\"] = receptive.ppvt_ss\nreceptive.loc[~PPVT & ROWPVT, \"score\"] = receptive.rowpvt_ss[~PPVT & ROWPVT]",
"execution_count": 67,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "There are 0 null values for test_type\n"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "A `school` variable was added, which is the first four columns of the `study_id`:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "receptive[\"school\"] = receptive.study_id.str.slice(0,4)",
"execution_count": 68,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "The age was taken to be the PPVT age if there are both test types:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "receptive[\"age_test\"] = receptive.age_test_ppvt\nreceptive.loc[receptive.age_test.isnull(), 'age_test'] = receptive.age_test_rowpvt[receptive.age_test.isnull()]",
"execution_count": 69,
"outputs": []
},
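{
"metadata": {},
"cell_type": "markdown",
"source": "The two-step fill above is equivalent to `combine_first`, which takes the PPVT age where present and falls back to the ROWPVT age:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Equivalent sketch of the fill above\nreceptive['age_test'] = receptive.age_test_ppvt.combine_first(receptive.age_test_rowpvt)",
"execution_count": null,
"outputs": []
},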
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "print(\"There are {0} null values for age_test\".format(sum(receptive.age_test.isnull())))",
"execution_count": 70,
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": "There are 25 null values for age_test\n"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Finally, we dropped unwanted columns and added a domain identification column for merging:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# receptive = receptive[[\"study_id\", \"redcap_event_name\", \"score\", \"test_type\", \"school\", \"age_test\"]]\nreceptive[\"domain\"] = \"Receptive Vocabulary\"\nreceptive.head()",
"execution_count": 71,
"outputs": [
{
"data": {
"text/html": "<div>\n<style>\n .dataframe thead tr:only-child th {\n text-align: right;\n }\n\n .dataframe thead th {\n text-align: left;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>study_id</th>\n <th>redcap_event_name</th>\n <th>age_test_ppvt</th>\n <th>ppvt_ss</th>\n <th>age_test_rowpvt</th>\n <th>rowpvt_ss</th>\n <th>test_type</th>\n <th>score</th>\n <th>school</th>\n <th>age_test</th>\n <th>domain</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>2</th>\n <td>0101-2002-0101</td>\n <td>year_2_complete_71_arm_1</td>\n <td>80.0</td>\n <td>90.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>PPVT</td>\n <td>90.0</td>\n <td>0101</td>\n <td>80.0</td>\n <td>Receptive Vocabulary</td>\n </tr>\n <tr>\n <th>5</th>\n <td>0101-2002-0101</td>\n <td>year_5_complete_71_arm_1</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>113.0</td>\n <td>101.0</td>\n <td>ROWPVT</td>\n <td>101.0</td>\n <td>0101</td>\n <td>113.0</td>\n <td>Receptive Vocabulary</td>\n </tr>\n <tr>\n <th>9</th>\n <td>0101-2003-0102</td>\n <td>initial_assessment_arm_1</td>\n <td>44.0</td>\n <td>55.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>PPVT</td>\n <td>55.0</td>\n <td>0101</td>\n <td>44.0</td>\n <td>Receptive Vocabulary</td>\n </tr>\n <tr>\n <th>10</th>\n <td>0101-2003-0102</td>\n <td>year_1_complete_71_arm_1</td>\n <td>54.0</td>\n <td>80.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>PPVT</td>\n <td>80.0</td>\n <td>0101</td>\n <td>54.0</td>\n <td>Receptive Vocabulary</td>\n </tr>\n <tr>\n <th>11</th>\n <td>0101-2003-0102</td>\n <td>year_2_complete_71_arm_1</td>\n <td>68.0</td>\n <td>101.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>PPVT</td>\n <td>101.0</td>\n <td>0101</td>\n <td>68.0</td>\n <td>Receptive Vocabulary</td>\n </tr>\n </tbody>\n</table>\n</div>",
"text/plain": " study_id redcap_event_name age_test_ppvt ppvt_ss \\\n2 0101-2002-0101 year_2_complete_71_arm_1 80.0 90.0 \n5 0101-2002-0101 year_5_complete_71_arm_1 NaN NaN \n9 0101-2003-0102 initial_assessment_arm_1 44.0 55.0 \n10 0101-2003-0102 year_1_complete_71_arm_1 54.0 80.0 \n11 0101-2003-0102 year_2_complete_71_arm_1 68.0 101.0 \n\n age_test_rowpvt rowpvt_ss test_type score school age_test \\\n2 NaN NaN PPVT 90.0 0101 80.0 \n5 113.0 101.0 ROWPVT 101.0 0101 113.0 \n9 NaN NaN PPVT 55.0 0101 44.0 \n10 NaN NaN PPVT 80.0 0101 54.0 \n11 NaN NaN PPVT 101.0 0101 68.0 \n\n domain \n2 Receptive Vocabulary \n5 Receptive Vocabulary \n9 Receptive Vocabulary \n10 Receptive Vocabulary \n11 Receptive Vocabulary "
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"scrolled": true,
"trusted": true
},
"cell_type": "code",
"source": "receptive.study_id.unique().shape",
"execution_count": 72,
"outputs": [
{
"data": {
"text/plain": "(3630,)"
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "receptive.to_csv(DATA_DIR+'receptive_vocabulary.csv')",
"execution_count": 73,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "## Merge datasets\n\nThe four datasets were mereged into a single table. First, we concatenate the test scores data:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "test_scores = pd.concat([articulation, expressive, receptive, language])",
"execution_count": 74,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Then we perform a merge between the demographic data and the test scores data:"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "lsl_dr = pd.merge(demographic, test_scores, on=[\"study_id\", \"redcap_event_name\"], how='left')",
"execution_count": 75,
"outputs": []
},
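{
"metadata": {},
"cell_type": "markdown",
"source": "A left merge keeps every demographic row whether or not matching scores exist. The `indicator` argument of `pd.merge` is a quick way to audit how many rows found a match (a diagnostic sketch, not part of the original pipeline):"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "# Diagnostic: count rows with and without matching test scores\nmerge_check = pd.merge(demographic, test_scores, on=[\"study_id\", \"redcap_event_name\"],\n                       how='left', indicator=True)\nmerge_check._merge.value_counts()",
"execution_count": null,
"outputs": []
},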
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "lsl_dr.tail()",
"execution_count": 76,
"outputs": [
{
"data": {
"text/html": "<div>\n<style>\n .dataframe thead tr:only-child th {\n text-align: right;\n }\n\n .dataframe thead th {\n text-align: left;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>redcap_event_name</th>\n <th>academic_year_rv</th>\n <th>hl</th>\n <th>male</th>\n <th>_race</th>\n <th>prim_lang</th>\n <th>sib</th>\n <th>_mother_ed</th>\n <th>father_ed</th>\n <th>parent_hl</th>\n <th>...</th>\n <th>domain</th>\n <th>eowpvt_ss</th>\n <th>evt_ss</th>\n <th>gf2_ss</th>\n <th>ppvt_ss</th>\n <th>rowpvt_ss</th>\n <th>school</th>\n <th>score</th>\n <th>test_name</th>\n <th>test_type</th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>47072</th>\n <td>year_9_complete_71_arm_1</td>\n <td>2015.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>6.0</td>\n <td>6.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>Receptive Vocabulary</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>97.0</td>\n <td>NaN</td>\n <td>0102</td>\n <td>97</td>\n <td>NaN</td>\n <td>PPVT</td>\n </tr>\n <tr>\n <th>47073</th>\n <td>year_9_complete_71_arm_1</td>\n <td>2012.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>4.0</td>\n <td>4.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>Articulation</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0735</td>\n <td>75</td>\n <td>NaN</td>\n <td>Arizonia</td>\n </tr>\n <tr>\n <th>47074</th>\n <td>year_9_complete_71_arm_1</td>\n <td>2012.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>4.0</td>\n <td>4.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>Expressive Vocabulary</td>\n <td>NaN</td>\n <td>79.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0735</td>\n <td>79</td>\n <td>NaN</td>\n <td>EVT</td>\n </tr>\n <tr>\n <th>47075</th>\n <td>year_9_complete_71_arm_1</td>\n <td>2012.0</td>\n 
<td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>4.0</td>\n <td>4.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>Receptive Vocabulary</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>73.0</td>\n <td>NaN</td>\n <td>0735</td>\n <td>73</td>\n <td>NaN</td>\n <td>PPVT</td>\n </tr>\n <tr>\n <th>47076</th>\n <td>year_9_complete_71_arm_1</td>\n <td>2012.0</td>\n <td>0.0</td>\n <td>1.0</td>\n <td>0.0</td>\n <td>0.0</td>\n <td>3.0</td>\n <td>4.0</td>\n <td>4.0</td>\n <td>0.0</td>\n <td>...</td>\n <td>Articulation</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>100.0</td>\n <td>NaN</td>\n <td>NaN</td>\n <td>0521</td>\n <td>100</td>\n <td>NaN</td>\n <td>Goldman</td>\n </tr>\n </tbody>\n</table>\n<p>5 rows × 97 columns</p>\n</div>",
"text/plain": " redcap_event_name academic_year_rv hl male _race \\\n47072 year_9_complete_71_arm_1 2015.0 0.0 0.0 0.0 \n47073 year_9_complete_71_arm_1 2012.0 0.0 0.0 0.0 \n47074 year_9_complete_71_arm_1 2012.0 0.0 0.0 0.0 \n47075 year_9_complete_71_arm_1 2012.0 0.0 0.0 0.0 \n47076 year_9_complete_71_arm_1 2012.0 0.0 1.0 0.0 \n\n prim_lang sib _mother_ed father_ed parent_hl ... \\\n47072 0.0 1.0 6.0 6.0 0.0 ... \n47073 0.0 3.0 4.0 4.0 0.0 ... \n47074 0.0 3.0 4.0 4.0 0.0 ... \n47075 0.0 3.0 4.0 4.0 0.0 ... \n47076 0.0 3.0 4.0 4.0 0.0 ... \n\n domain eowpvt_ss evt_ss gf2_ss ppvt_ss rowpvt_ss \\\n47072 Receptive Vocabulary NaN NaN NaN 97.0 NaN \n47073 Articulation NaN NaN NaN NaN NaN \n47074 Expressive Vocabulary NaN 79.0 NaN NaN NaN \n47075 Receptive Vocabulary NaN NaN NaN 73.0 NaN \n47076 Articulation NaN NaN 100.0 NaN NaN \n\n school score test_name test_type \n47072 0102 97 NaN PPVT \n47073 0735 75 NaN Arizonia \n47074 0735 79 NaN EVT \n47075 0735 73 NaN PPVT \n47076 0521 100 NaN Goldman \n\n[5 rows x 97 columns]"
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
]
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Convert score to floating-point number"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "lsl_dr.score = lsl_dr.score.astype(float)",
"execution_count": 77,
"outputs": []
},
{
"metadata": {},
"cell_type": "markdown",
"source": "Export dataset"
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "lsl_dr.to_csv(DATA_DIR+'lsl_dr.csv')",
"execution_count": 78,
"outputs": []
},
{
"metadata": {
"trusted": true
},
"cell_type": "code",
"source": "lsl_dr.shape",
"execution_count": 79,
"outputs": [
{
"data": {
"text/plain": "(47077, 97)"
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
]
}
],
"metadata": {
"kernelspec": {
"name": "python3",
"display_name": "Python 3",
"language": "python"
},
"language_info": {
"name": "python",
"version": "3.6.1",
"mimetype": "text/x-python",
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"pygments_lexer": "ipython3",
"nbconvert_exporter": "python",
"file_extension": ".py"
},
"widgets": {
"state": {},
"version": "1.0.0"
},
"gist": {
"id": "",
"data": {
"description": "Dataset Processing.ipynb",
"public": true
}
}
},
"nbformat": 4,
"nbformat_minor": 1
}