Last active
December 17, 2015 12:39
-
-
Save yeesian/5611329 to your computer and use it in GitHub Desktop.
A brief tutorial on the cleaning of the data scraped from NUS CORS archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "Data_Cleaning_(bidding_summary)" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Cleaning NUS Bidding Summary\n", | |
"---\n", | |
"(this is a continuation of the instructions (from step 7 onwards) at the [NUS-Bidding-History](https://github.com/yeesian/NUS-Bidding-History) repository.)\n", | |
"\n", | |
"\n", | |
"libraries used:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"import pandas as pd\n", | |
"print('pandas',pd.version.version)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
"pandas 0.11.0\n" | |
] | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_summary = pd.read_csv('bidding_summary.csv',sep='|')\n", | |
"bid_summary # notice how there are missing values in the first 2 columns\"" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<pre>\n", | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 175383 entries, 0 to 175382\n", | |
"Data columns (total 12 columns):\n", | |
"Module 134943 non-null values\n", | |
"Group 134943 non-null values\n", | |
"Quota 175383 non-null values\n", | |
"No_of_Bidders 175383 non-null values\n", | |
"Lowest_Bid 175383 non-null values\n", | |
"Lowest_Succ_Bid 175383 non-null values\n", | |
"Highest_Bid 175383 non-null values\n", | |
"Faculty 175383 non-null values\n", | |
"Student_Type 175383 non-null values\n", | |
"Bid_Round 175383 non-null values\n", | |
"Acad_Yr 175383 non-null values\n", | |
"Semester 175383 non-null values\n", | |
"dtypes: int64(2), object(10)\n", | |
"</pre>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 175383 entries, 0 to 175382\n", | |
"Data columns (total 12 columns):\n", | |
"Module 134943 non-null values\n", | |
"Group 134943 non-null values\n", | |
"Quota 175383 non-null values\n", | |
"No_of_Bidders 175383 non-null values\n", | |
"Lowest_Bid 175383 non-null values\n", | |
"Lowest_Succ_Bid 175383 non-null values\n", | |
"Highest_Bid 175383 non-null values\n", | |
"Faculty 175383 non-null values\n", | |
"Student_Type 175383 non-null values\n", | |
"Bid_Round 175383 non-null values\n", | |
"Acad_Yr 175383 non-null values\n", | |
"Semester 175383 non-null values\n", | |
"dtypes: int64(2), object(10)" | |
] | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print(bid_summary.head())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Module Group Quota No_of_Bidders Lowest_Bid Lowest_Succ_Bid \\\n", | |
"0 Module Group Quota No Lowest Lowest \n", | |
"1 ACC1002 LECTURE B1 18 0 0 0 \n", | |
"2 ACC1002X LECTURE B1 61 2 1 1 \n", | |
"3 NaN NaN 27 18 1 1 \n", | |
"4 NaN NaN 45 42 1 1 \n", | |
"\n", | |
" Highest_Bid Faculty Student_Type Bid_Round \\\n", | |
"0 Highest Faculty Student Type 1C \n", | |
"1 0 SCHOOL OF BUSINESS Returning Students [P] 1C \n", | |
"2 50 ENGINEERING Returning Students [P] 1C \n", | |
"3 2264 SCHOOL OF COMPUTING Returning Students [P] 1C \n", | |
"4 747 ENGINEERING New Students [P] 1C \n", | |
"\n", | |
" Acad_Yr Semester \n", | |
"0 20112012 1 \n", | |
"1 20112012 1 \n", | |
"2 20112012 1 \n", | |
"3 20112012 1 \n", | |
"4 20112012 1 \n" | |
] | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_summary = bid_summary.fillna(method='pad') # we fill downwards\n", | |
"bid_summary" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<pre>\n", | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 175383 entries, 0 to 175382\n", | |
"Data columns (total 12 columns):\n", | |
"Module 175383 non-null values\n", | |
"Group 175383 non-null values\n", | |
"Quota 175383 non-null values\n", | |
"No_of_Bidders 175383 non-null values\n", | |
"Lowest_Bid 175383 non-null values\n", | |
"Lowest_Succ_Bid 175383 non-null values\n", | |
"Highest_Bid 175383 non-null values\n", | |
"Faculty 175383 non-null values\n", | |
"Student_Type 175383 non-null values\n", | |
"Bid_Round 175383 non-null values\n", | |
"Acad_Yr 175383 non-null values\n", | |
"Semester 175383 non-null values\n", | |
"dtypes: int64(2), object(10)\n", | |
"</pre>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 4, | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 175383 entries, 0 to 175382\n", | |
"Data columns (total 12 columns):\n", | |
"Module 175383 non-null values\n", | |
"Group 175383 non-null values\n", | |
"Quota 175383 non-null values\n", | |
"No_of_Bidders 175383 non-null values\n", | |
"Lowest_Bid 175383 non-null values\n", | |
"Lowest_Succ_Bid 175383 non-null values\n", | |
"Highest_Bid 175383 non-null values\n", | |
"Faculty 175383 non-null values\n", | |
"Student_Type 175383 non-null values\n", | |
"Bid_Round 175383 non-null values\n", | |
"Acad_Yr 175383 non-null values\n", | |
"Semester 175383 non-null values\n", | |
"dtypes: int64(2), object(10)" | |
] | |
} | |
], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print(bid_summary.head()) # now, observe the duplicate header rows (eg. row 0)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Module Group Quota No_of_Bidders Lowest_Bid Lowest_Succ_Bid \\\n", | |
"0 Module Group Quota No Lowest Lowest \n", | |
"1 ACC1002 LECTURE B1 18 0 0 0 \n", | |
"2 ACC1002X LECTURE B1 61 2 1 1 \n", | |
"3 ACC1002X LECTURE B1 27 18 1 1 \n", | |
"4 ACC1002X LECTURE B1 45 42 1 1 \n", | |
"\n", | |
" Highest_Bid Faculty Student_Type Bid_Round \\\n", | |
"0 Highest Faculty Student Type 1C \n", | |
"1 0 SCHOOL OF BUSINESS Returning Students [P] 1C \n", | |
"2 50 ENGINEERING Returning Students [P] 1C \n", | |
"3 2264 SCHOOL OF COMPUTING Returning Students [P] 1C \n", | |
"4 747 ENGINEERING New Students [P] 1C \n", | |
"\n", | |
" Acad_Yr Semester \n", | |
"0 20112012 1 \n", | |
"1 20112012 1 \n", | |
"2 20112012 1 \n", | |
"3 20112012 1 \n", | |
"4 20112012 1 \n" | |
] | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_summary = bid_summary[bid_summary['Module'] != 'Module'] # filter out [duplicate] headers\n", | |
"bid_summary" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<pre>\n", | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 175257 entries, 1 to 175382\n", | |
"Data columns (total 12 columns):\n", | |
"Module 175257 non-null values\n", | |
"Group 175257 non-null values\n", | |
"Quota 175257 non-null values\n", | |
"No_of_Bidders 175257 non-null values\n", | |
"Lowest_Bid 175257 non-null values\n", | |
"Lowest_Succ_Bid 175257 non-null values\n", | |
"Highest_Bid 175257 non-null values\n", | |
"Faculty 175257 non-null values\n", | |
"Student_Type 175257 non-null values\n", | |
"Bid_Round 175257 non-null values\n", | |
"Acad_Yr 175257 non-null values\n", | |
"Semester 175257 non-null values\n", | |
"dtypes: int64(2), object(10)\n", | |
"</pre>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 6, | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 175257 entries, 1 to 175382\n", | |
"Data columns (total 12 columns):\n", | |
"Module 175257 non-null values\n", | |
"Group 175257 non-null values\n", | |
"Quota 175257 non-null values\n", | |
"No_of_Bidders 175257 non-null values\n", | |
"Lowest_Bid 175257 non-null values\n", | |
"Lowest_Succ_Bid 175257 non-null values\n", | |
"Highest_Bid 175257 non-null values\n", | |
"Faculty 175257 non-null values\n", | |
"Student_Type 175257 non-null values\n", | |
"Bid_Round 175257 non-null values\n", | |
"Acad_Yr 175257 non-null values\n", | |
"Semester 175257 non-null values\n", | |
"dtypes: int64(2), object(10)" | |
] | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"print(bid_summary.head())" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": [ | |
" Module Group Quota No_of_Bidders Lowest_Bid Lowest_Succ_Bid \\\n", | |
"1 ACC1002 LECTURE B1 18 0 0 0 \n", | |
"2 ACC1002X LECTURE B1 61 2 1 1 \n", | |
"3 ACC1002X LECTURE B1 27 18 1 1 \n", | |
"4 ACC1002X LECTURE B1 45 42 1 1 \n", | |
"5 ACC1002X LECTURE B1 30 56 51 327 \n", | |
"\n", | |
" Highest_Bid Faculty Student_Type Bid_Round \\\n", | |
"1 0 SCHOOL OF BUSINESS Returning Students [P] 1C \n", | |
"2 50 ENGINEERING Returning Students [P] 1C \n", | |
"3 2264 SCHOOL OF COMPUTING Returning Students [P] 1C \n", | |
"4 747 ENGINEERING New Students [P] 1C \n", | |
"5 635 SCHOOL OF COMPUTING New Students [P] 1C \n", | |
"\n", | |
" Acad_Yr Semester \n", | |
"1 20112012 1 \n", | |
"2 20112012 1 \n", | |
"3 20112012 1 \n", | |
"4 20112012 1 \n", | |
"5 20112012 1 \n" | |
] | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"set(bid_summary['Student_Type'])\n", | |
"\n", | |
"# the source of a number of problems\n", | |
"# i) \"NUS Students [P, G]\" has a comma inside\n", | |
"# ii) 'Returning Students [P] and ' was cropped off\n", | |
"# iii) 'Returning Students and ' was also cropped off" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 8, | |
"text": [ | |
"set(['Returning Students [P]',\n", | |
" 'Returning Students [P] and ',\n", | |
" 'NUS Students [G]',\n", | |
" 'NUS Students [P]',\n", | |
" 'NUS Students [P, G]',\n", | |
" 'New Students [P]',\n", | |
" 'Reserved for [G] in later round',\n", | |
" 'Not Available for [G]',\n", | |
" 'Returning Students and '])" | |
] | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"# remove the comma in the field: 'NUS Students [P, G]'\n", | |
"bid_summary.ix[bid_summary.Student_Type == 'NUS Students [P, G]', 'Student_Type'] = 'NUS Students [PG]'\n", | |
"set(bid_summary['Student_Type']) # doublecheck, so we can save it as comma-separated values later" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 9, | |
"text": [ | |
"set(['NUS Students [PG]',\n", | |
" 'Returning Students [P]',\n", | |
" 'Returning Students [P] and ',\n", | |
" 'NUS Students [G]',\n", | |
" 'NUS Students [P]',\n", | |
" 'New Students [P]',\n", | |
" 'Reserved for [G] in later round',\n", | |
" 'Not Available for [G]',\n", | |
" 'Returning Students and '])" | |
] | |
} | |
], | |
"prompt_number": 9 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_summary.ix[bid_summary.Student_Type == 'Returning Students [P] and ', 'Student_Type'] = 'Returning Students [P] and NUS Students [G]'\n", | |
"bid_summary.ix[bid_summary.Student_Type == 'Returning Students and ', 'Student_Type'] = 'Returning Students and New Students [P]'\n", | |
"set(bid_summary['Student_Type']) # doublecheck" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 10, | |
"text": [ | |
"set(['NUS Students [PG]',\n", | |
" 'Returning Students [P]',\n", | |
" 'Returning Students and New Students [P]',\n", | |
" 'NUS Students [G]',\n", | |
" 'NUS Students [P]',\n", | |
" 'New Students [P]',\n", | |
" 'Reserved for [G] in later round',\n", | |
" 'Not Available for [G]',\n", | |
" 'Returning Students [P] and NUS Students [G]'])" | |
] | |
} | |
], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"set(bid_summary['Acad_Yr']) # some of the years (0405 and 0506) are inconsistent" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 11, | |
"text": [ | |
"set([20072008,\n", | |
" 20092010,\n", | |
" 20112012,\n", | |
" 405,\n", | |
" 20062007,\n", | |
" 20082009,\n", | |
" 506,\n", | |
" 20102011,\n", | |
" 20122013])" | |
] | |
} | |
], | |
"prompt_number": 11 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_summary.ix[bid_summary.Acad_Yr == 405, 'Acad_Yr'] = 20042005\n", | |
"bid_summary.ix[bid_summary.Acad_Yr == 506, 'Acad_Yr'] = 20052006\n", | |
"set(bid_summary['Acad_Yr']) # some of the years are broken" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 12, | |
"text": [ | |
"set([20052006,\n", | |
" 20072008,\n", | |
" 20092010,\n", | |
" 20112012,\n", | |
" 20042005,\n", | |
" 20062007,\n", | |
" 20082009,\n", | |
" 20102011,\n", | |
" 20122013])" | |
] | |
} | |
], | |
"prompt_number": 12 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_summary.dtypes # let's check the datatype for the rest of the fields" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 13, | |
"text": [ | |
"Module object\n", | |
"Group object\n", | |
"Quota object\n", | |
"No_of_Bidders object\n", | |
"Lowest_Bid object\n", | |
"Lowest_Succ_Bid object\n", | |
"Highest_Bid object\n", | |
"Faculty object\n", | |
"Student_Type object\n", | |
"Bid_Round object\n", | |
"Acad_Yr int64\n", | |
"Semester int64\n", | |
"dtype: object" | |
] | |
} | |
], | |
"prompt_number": 13 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"for header in ['Quota','No_of_Bidders','Lowest_Bid','Lowest_Succ_Bid','Highest_Bid']:\n", | |
" bid_summary[header] = bid_summary[header].map(int) # convert to Int64\n", | |
"bid_summary" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<pre>\n", | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 175257 entries, 1 to 175382\n", | |
"Data columns (total 12 columns):\n", | |
"Module 175257 non-null values\n", | |
"Group 175257 non-null values\n", | |
"Quota 175257 non-null values\n", | |
"No_of_Bidders 175257 non-null values\n", | |
"Lowest_Bid 175257 non-null values\n", | |
"Lowest_Succ_Bid 175257 non-null values\n", | |
"Highest_Bid 175257 non-null values\n", | |
"Faculty 175257 non-null values\n", | |
"Student_Type 175257 non-null values\n", | |
"Bid_Round 175257 non-null values\n", | |
"Acad_Yr 175257 non-null values\n", | |
"Semester 175257 non-null values\n", | |
"dtypes: int64(7), object(5)\n", | |
"</pre>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 14, | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 175257 entries, 1 to 175382\n", | |
"Data columns (total 12 columns):\n", | |
"Module 175257 non-null values\n", | |
"Group 175257 non-null values\n", | |
"Quota 175257 non-null values\n", | |
"No_of_Bidders 175257 non-null values\n", | |
"Lowest_Bid 175257 non-null values\n", | |
"Lowest_Succ_Bid 175257 non-null values\n", | |
"Highest_Bid 175257 non-null values\n", | |
"Faculty 175257 non-null values\n", | |
"Student_Type 175257 non-null values\n", | |
"Bid_Round 175257 non-null values\n", | |
"Acad_Yr 175257 non-null values\n", | |
"Semester 175257 non-null values\n", | |
"dtypes: int64(7), object(5)" | |
] | |
} | |
], | |
"prompt_number": 14 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_summary.dtypes" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "pyout", | |
"prompt_number": 15, | |
"text": [ | |
"Module object\n", | |
"Group object\n", | |
"Quota int64\n", | |
"No_of_Bidders int64\n", | |
"Lowest_Bid int64\n", | |
"Lowest_Succ_Bid int64\n", | |
"Highest_Bid int64\n", | |
"Faculty object\n", | |
"Student_Type object\n", | |
"Bid_Round object\n", | |
"Acad_Yr int64\n", | |
"Semester int64\n", | |
"dtype: object" | |
] | |
} | |
], | |
"prompt_number": 15 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_summary.to_csv('nus_bidding_summary.csv',index=False)" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 16 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": [ | |
"bid_summary = pd.read_csv('nus_bidding_summary.csv')\n", | |
"bid_summary # and we're done" | |
], | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": [ | |
"<pre>\n", | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 175257 entries, 0 to 175256\n", | |
"Data columns (total 12 columns):\n", | |
"Module 175257 non-null values\n", | |
"Group 175257 non-null values\n", | |
"Quota 175257 non-null values\n", | |
"No_of_Bidders 175257 non-null values\n", | |
"Lowest_Bid 175257 non-null values\n", | |
"Lowest_Succ_Bid 175257 non-null values\n", | |
"Highest_Bid 175257 non-null values\n", | |
"Faculty 175257 non-null values\n", | |
"Student_Type 175257 non-null values\n", | |
"Bid_Round 175257 non-null values\n", | |
"Acad_Yr 175257 non-null values\n", | |
"Semester 175257 non-null values\n", | |
"dtypes: int64(7), object(5)\n", | |
"</pre>" | |
], | |
"output_type": "pyout", | |
"prompt_number": 17, | |
"text": [ | |
"<class 'pandas.core.frame.DataFrame'>\n", | |
"Int64Index: 175257 entries, 0 to 175256\n", | |
"Data columns (total 12 columns):\n", | |
"Module 175257 non-null values\n", | |
"Group 175257 non-null values\n", | |
"Quota 175257 non-null values\n", | |
"No_of_Bidders 175257 non-null values\n", | |
"Lowest_Bid 175257 non-null values\n", | |
"Lowest_Succ_Bid 175257 non-null values\n", | |
"Highest_Bid 175257 non-null values\n", | |
"Faculty 175257 non-null values\n", | |
"Student_Type 175257 non-null values\n", | |
"Bid_Round 175257 non-null values\n", | |
"Acad_Yr 175257 non-null values\n", | |
"Semester 175257 non-null values\n", | |
"dtypes: int64(7), object(5)" | |
] | |
} | |
], | |
"prompt_number": 17 | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment