Skip to content

Instantly share code, notes, and snippets.

@yeesian
Last active December 17, 2015 12:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yeesian/5611329 to your computer and use it in GitHub Desktop.
Save yeesian/5611329 to your computer and use it in GitHub Desktop.
A brief tutorial on the cleaning of the data scraped from NUS CORS archive
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "Data_Cleaning_(bidding_summary)"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Cleaning NUS Bidding Summary\n",
"---\n",
"(this is a continuation of the instructions (from step 7 onwards) at the [NUS-Bidding-History](https://github.com/yeesian/NUS-Bidding-History) repository.)\n",
"\n",
"\n",
"libraries used:"
]
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"print('pandas',pd.version.version)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"pandas 0.11.0\n"
]
}
],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"bid_summary = pd.read_csv('bidding_summary.csv',sep='|')\n",
"bid_summary # notice how there are missing values in the first 2 columns\""
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<pre>\n",
"&ltclass 'pandas.core.frame.DataFrame'&gt\n",
"Int64Index: 175383 entries, 0 to 175382\n",
"Data columns (total 12 columns):\n",
"Module 134943 non-null values\n",
"Group 134943 non-null values\n",
"Quota 175383 non-null values\n",
"No_of_Bidders 175383 non-null values\n",
"Lowest_Bid 175383 non-null values\n",
"Lowest_Succ_Bid 175383 non-null values\n",
"Highest_Bid 175383 non-null values\n",
"Faculty 175383 non-null values\n",
"Student_Type 175383 non-null values\n",
"Bid_Round 175383 non-null values\n",
"Acad_Yr 175383 non-null values\n",
"Semester 175383 non-null values\n",
"dtypes: int64(2), object(10)\n",
"</pre>"
],
"output_type": "pyout",
"prompt_number": 2,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 175383 entries, 0 to 175382\n",
"Data columns (total 12 columns):\n",
"Module 134943 non-null values\n",
"Group 134943 non-null values\n",
"Quota 175383 non-null values\n",
"No_of_Bidders 175383 non-null values\n",
"Lowest_Bid 175383 non-null values\n",
"Lowest_Succ_Bid 175383 non-null values\n",
"Highest_Bid 175383 non-null values\n",
"Faculty 175383 non-null values\n",
"Student_Type 175383 non-null values\n",
"Bid_Round 175383 non-null values\n",
"Acad_Yr 175383 non-null values\n",
"Semester 175383 non-null values\n",
"dtypes: int64(2), object(10)"
]
}
],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(bid_summary.head())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" Module Group Quota No_of_Bidders Lowest_Bid Lowest_Succ_Bid \\\n",
"0 Module Group Quota No Lowest Lowest \n",
"1 ACC1002 LECTURE B1 18 0 0 0 \n",
"2 ACC1002X LECTURE B1 61 2 1 1 \n",
"3 NaN NaN 27 18 1 1 \n",
"4 NaN NaN 45 42 1 1 \n",
"\n",
" Highest_Bid Faculty Student_Type Bid_Round \\\n",
"0 Highest Faculty Student Type 1C \n",
"1 0 SCHOOL OF BUSINESS Returning Students [P] 1C \n",
"2 50 ENGINEERING Returning Students [P] 1C \n",
"3 2264 SCHOOL OF COMPUTING Returning Students [P] 1C \n",
"4 747 ENGINEERING New Students [P] 1C \n",
"\n",
" Acad_Yr Semester \n",
"0 20112012 1 \n",
"1 20112012 1 \n",
"2 20112012 1 \n",
"3 20112012 1 \n",
"4 20112012 1 \n"
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"bid_summary = bid_summary.fillna(method='pad') # we fill downwards\n",
"bid_summary"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<pre>\n",
"&ltclass 'pandas.core.frame.DataFrame'&gt\n",
"Int64Index: 175383 entries, 0 to 175382\n",
"Data columns (total 12 columns):\n",
"Module 175383 non-null values\n",
"Group 175383 non-null values\n",
"Quota 175383 non-null values\n",
"No_of_Bidders 175383 non-null values\n",
"Lowest_Bid 175383 non-null values\n",
"Lowest_Succ_Bid 175383 non-null values\n",
"Highest_Bid 175383 non-null values\n",
"Faculty 175383 non-null values\n",
"Student_Type 175383 non-null values\n",
"Bid_Round 175383 non-null values\n",
"Acad_Yr 175383 non-null values\n",
"Semester 175383 non-null values\n",
"dtypes: int64(2), object(10)\n",
"</pre>"
],
"output_type": "pyout",
"prompt_number": 4,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 175383 entries, 0 to 175382\n",
"Data columns (total 12 columns):\n",
"Module 175383 non-null values\n",
"Group 175383 non-null values\n",
"Quota 175383 non-null values\n",
"No_of_Bidders 175383 non-null values\n",
"Lowest_Bid 175383 non-null values\n",
"Lowest_Succ_Bid 175383 non-null values\n",
"Highest_Bid 175383 non-null values\n",
"Faculty 175383 non-null values\n",
"Student_Type 175383 non-null values\n",
"Bid_Round 175383 non-null values\n",
"Acad_Yr 175383 non-null values\n",
"Semester 175383 non-null values\n",
"dtypes: int64(2), object(10)"
]
}
],
"prompt_number": 4
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(bid_summary.head()) # now, observe the duplicate header rows (eg. row 0)"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" Module Group Quota No_of_Bidders Lowest_Bid Lowest_Succ_Bid \\\n",
"0 Module Group Quota No Lowest Lowest \n",
"1 ACC1002 LECTURE B1 18 0 0 0 \n",
"2 ACC1002X LECTURE B1 61 2 1 1 \n",
"3 ACC1002X LECTURE B1 27 18 1 1 \n",
"4 ACC1002X LECTURE B1 45 42 1 1 \n",
"\n",
" Highest_Bid Faculty Student_Type Bid_Round \\\n",
"0 Highest Faculty Student Type 1C \n",
"1 0 SCHOOL OF BUSINESS Returning Students [P] 1C \n",
"2 50 ENGINEERING Returning Students [P] 1C \n",
"3 2264 SCHOOL OF COMPUTING Returning Students [P] 1C \n",
"4 747 ENGINEERING New Students [P] 1C \n",
"\n",
" Acad_Yr Semester \n",
"0 20112012 1 \n",
"1 20112012 1 \n",
"2 20112012 1 \n",
"3 20112012 1 \n",
"4 20112012 1 \n"
]
}
],
"prompt_number": 5
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"bid_summary = bid_summary[bid_summary['Module'] != 'Module'] # filter out [duplicate] headers\n",
"bid_summary"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<pre>\n",
"&ltclass 'pandas.core.frame.DataFrame'&gt\n",
"Int64Index: 175257 entries, 1 to 175382\n",
"Data columns (total 12 columns):\n",
"Module 175257 non-null values\n",
"Group 175257 non-null values\n",
"Quota 175257 non-null values\n",
"No_of_Bidders 175257 non-null values\n",
"Lowest_Bid 175257 non-null values\n",
"Lowest_Succ_Bid 175257 non-null values\n",
"Highest_Bid 175257 non-null values\n",
"Faculty 175257 non-null values\n",
"Student_Type 175257 non-null values\n",
"Bid_Round 175257 non-null values\n",
"Acad_Yr 175257 non-null values\n",
"Semester 175257 non-null values\n",
"dtypes: int64(2), object(10)\n",
"</pre>"
],
"output_type": "pyout",
"prompt_number": 6,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 175257 entries, 1 to 175382\n",
"Data columns (total 12 columns):\n",
"Module 175257 non-null values\n",
"Group 175257 non-null values\n",
"Quota 175257 non-null values\n",
"No_of_Bidders 175257 non-null values\n",
"Lowest_Bid 175257 non-null values\n",
"Lowest_Succ_Bid 175257 non-null values\n",
"Highest_Bid 175257 non-null values\n",
"Faculty 175257 non-null values\n",
"Student_Type 175257 non-null values\n",
"Bid_Round 175257 non-null values\n",
"Acad_Yr 175257 non-null values\n",
"Semester 175257 non-null values\n",
"dtypes: int64(2), object(10)"
]
}
],
"prompt_number": 6
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"print(bid_summary.head())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
" Module Group Quota No_of_Bidders Lowest_Bid Lowest_Succ_Bid \\\n",
"1 ACC1002 LECTURE B1 18 0 0 0 \n",
"2 ACC1002X LECTURE B1 61 2 1 1 \n",
"3 ACC1002X LECTURE B1 27 18 1 1 \n",
"4 ACC1002X LECTURE B1 45 42 1 1 \n",
"5 ACC1002X LECTURE B1 30 56 51 327 \n",
"\n",
" Highest_Bid Faculty Student_Type Bid_Round \\\n",
"1 0 SCHOOL OF BUSINESS Returning Students [P] 1C \n",
"2 50 ENGINEERING Returning Students [P] 1C \n",
"3 2264 SCHOOL OF COMPUTING Returning Students [P] 1C \n",
"4 747 ENGINEERING New Students [P] 1C \n",
"5 635 SCHOOL OF COMPUTING New Students [P] 1C \n",
"\n",
" Acad_Yr Semester \n",
"1 20112012 1 \n",
"2 20112012 1 \n",
"3 20112012 1 \n",
"4 20112012 1 \n",
"5 20112012 1 \n"
]
}
],
"prompt_number": 7
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"set(bid_summary['Student_Type'])\n",
"\n",
"# the source of a number of problems\n",
"# i) \"NUS Students [P, G]\" has a comma inside\n",
"# ii) 'Returning Students [P] and ' was cropped off\n",
"# iii) 'Returning Students and ' was also cropped off"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 8,
"text": [
"set(['Returning Students [P]',\n",
" 'Returning Students [P] and ',\n",
" 'NUS Students [G]',\n",
" 'NUS Students [P]',\n",
" 'NUS Students [P, G]',\n",
" 'New Students [P]',\n",
" 'Reserved for [G] in later round',\n",
" 'Not Available for [G]',\n",
" 'Returning Students and '])"
]
}
],
"prompt_number": 8
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"# remove the comma in the field: 'NUS Students [P, G]'\n",
"bid_summary.ix[bid_summary.Student_Type == 'NUS Students [P, G]', 'Student_Type'] = 'NUS Students [PG]'\n",
"set(bid_summary['Student_Type']) # doublecheck, so we can save it as comma-separated values later"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 9,
"text": [
"set(['NUS Students [PG]',\n",
" 'Returning Students [P]',\n",
" 'Returning Students [P] and ',\n",
" 'NUS Students [G]',\n",
" 'NUS Students [P]',\n",
" 'New Students [P]',\n",
" 'Reserved for [G] in later round',\n",
" 'Not Available for [G]',\n",
" 'Returning Students and '])"
]
}
],
"prompt_number": 9
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"bid_summary.ix[bid_summary.Student_Type == 'Returning Students [P] and ', 'Student_Type'] = 'Returning Students [P] and NUS Students [G]'\n",
"bid_summary.ix[bid_summary.Student_Type == 'Returning Students and ', 'Student_Type'] = 'Returning Students and New Students [P]'\n",
"set(bid_summary['Student_Type']) # doublecheck"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 10,
"text": [
"set(['NUS Students [PG]',\n",
" 'Returning Students [P]',\n",
" 'Returning Students and New Students [P]',\n",
" 'NUS Students [G]',\n",
" 'NUS Students [P]',\n",
" 'New Students [P]',\n",
" 'Reserved for [G] in later round',\n",
" 'Not Available for [G]',\n",
" 'Returning Students [P] and NUS Students [G]'])"
]
}
],
"prompt_number": 10
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"set(bid_summary['Acad_Yr']) # some of the years (0405 and 0506) are inconsistent"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 11,
"text": [
"set([20072008,\n",
" 20092010,\n",
" 20112012,\n",
" 405,\n",
" 20062007,\n",
" 20082009,\n",
" 506,\n",
" 20102011,\n",
" 20122013])"
]
}
],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"bid_summary.ix[bid_summary.Acad_Yr == 405, 'Acad_Yr'] = 20042005\n",
"bid_summary.ix[bid_summary.Acad_Yr == 506, 'Acad_Yr'] = 20052006\n",
"set(bid_summary['Acad_Yr']) # some of the years are broken"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 12,
"text": [
"set([20052006,\n",
" 20072008,\n",
" 20092010,\n",
" 20112012,\n",
" 20042005,\n",
" 20062007,\n",
" 20082009,\n",
" 20102011,\n",
" 20122013])"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"bid_summary.dtypes # let's check the datatype for the rest of the fields"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 13,
"text": [
"Module object\n",
"Group object\n",
"Quota object\n",
"No_of_Bidders object\n",
"Lowest_Bid object\n",
"Lowest_Succ_Bid object\n",
"Highest_Bid object\n",
"Faculty object\n",
"Student_Type object\n",
"Bid_Round object\n",
"Acad_Yr int64\n",
"Semester int64\n",
"dtype: object"
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"for header in ['Quota','No_of_Bidders','Lowest_Bid','Lowest_Succ_Bid','Highest_Bid']:\n",
" bid_summary[header] = bid_summary[header].map(int) # convert to Int64\n",
"bid_summary"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<pre>\n",
"&ltclass 'pandas.core.frame.DataFrame'&gt\n",
"Int64Index: 175257 entries, 1 to 175382\n",
"Data columns (total 12 columns):\n",
"Module 175257 non-null values\n",
"Group 175257 non-null values\n",
"Quota 175257 non-null values\n",
"No_of_Bidders 175257 non-null values\n",
"Lowest_Bid 175257 non-null values\n",
"Lowest_Succ_Bid 175257 non-null values\n",
"Highest_Bid 175257 non-null values\n",
"Faculty 175257 non-null values\n",
"Student_Type 175257 non-null values\n",
"Bid_Round 175257 non-null values\n",
"Acad_Yr 175257 non-null values\n",
"Semester 175257 non-null values\n",
"dtypes: int64(7), object(5)\n",
"</pre>"
],
"output_type": "pyout",
"prompt_number": 14,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 175257 entries, 1 to 175382\n",
"Data columns (total 12 columns):\n",
"Module 175257 non-null values\n",
"Group 175257 non-null values\n",
"Quota 175257 non-null values\n",
"No_of_Bidders 175257 non-null values\n",
"Lowest_Bid 175257 non-null values\n",
"Lowest_Succ_Bid 175257 non-null values\n",
"Highest_Bid 175257 non-null values\n",
"Faculty 175257 non-null values\n",
"Student_Type 175257 non-null values\n",
"Bid_Round 175257 non-null values\n",
"Acad_Yr 175257 non-null values\n",
"Semester 175257 non-null values\n",
"dtypes: int64(7), object(5)"
]
}
],
"prompt_number": 14
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"bid_summary.dtypes"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "pyout",
"prompt_number": 15,
"text": [
"Module object\n",
"Group object\n",
"Quota int64\n",
"No_of_Bidders int64\n",
"Lowest_Bid int64\n",
"Lowest_Succ_Bid int64\n",
"Highest_Bid int64\n",
"Faculty object\n",
"Student_Type object\n",
"Bid_Round object\n",
"Acad_Yr int64\n",
"Semester int64\n",
"dtype: object"
]
}
],
"prompt_number": 15
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"bid_summary.to_csv('nus_bidding_summary.csv',index=False)"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 16
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"bid_summary = pd.read_csv('nus_bidding_summary.csv')\n",
"bid_summary # and we're done"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<pre>\n",
"&ltclass 'pandas.core.frame.DataFrame'&gt\n",
"Int64Index: 175257 entries, 0 to 175256\n",
"Data columns (total 12 columns):\n",
"Module 175257 non-null values\n",
"Group 175257 non-null values\n",
"Quota 175257 non-null values\n",
"No_of_Bidders 175257 non-null values\n",
"Lowest_Bid 175257 non-null values\n",
"Lowest_Succ_Bid 175257 non-null values\n",
"Highest_Bid 175257 non-null values\n",
"Faculty 175257 non-null values\n",
"Student_Type 175257 non-null values\n",
"Bid_Round 175257 non-null values\n",
"Acad_Yr 175257 non-null values\n",
"Semester 175257 non-null values\n",
"dtypes: int64(7), object(5)\n",
"</pre>"
],
"output_type": "pyout",
"prompt_number": 17,
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 175257 entries, 0 to 175256\n",
"Data columns (total 12 columns):\n",
"Module 175257 non-null values\n",
"Group 175257 non-null values\n",
"Quota 175257 non-null values\n",
"No_of_Bidders 175257 non-null values\n",
"Lowest_Bid 175257 non-null values\n",
"Lowest_Succ_Bid 175257 non-null values\n",
"Highest_Bid 175257 non-null values\n",
"Faculty 175257 non-null values\n",
"Student_Type 175257 non-null values\n",
"Bid_Round 175257 non-null values\n",
"Acad_Yr 175257 non-null values\n",
"Semester 175257 non-null values\n",
"dtypes: int64(7), object(5)"
]
}
],
"prompt_number": 17
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment