Created
May 20, 2013 10:48
-
-
Save yeesian/5611582 to your computer and use it in GitHub Desktop.
A brief tutorial on cleaning the data scraped from the NUS CORS archive
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"metadata": { | |
"name": "Data_Cleaning_(bidding_activity)" | |
}, | |
"nbformat": 3, | |
"nbformat_minor": 0, | |
"worksheets": [ | |
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": "Cleaning NUS Bidding Activities\n---\n(this is a continuation of the instructions (from step 7 onwards) at the [NUS-Bidding-History](https://github.com/yeesian/NUS-Bidding-History) repository)" | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "import pandas as pd\nprint('pandas',pd.version.version)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "pandas 0.11.0\n" | |
} | |
], | |
"prompt_number": 1 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "bid_activity = pd.read_csv('bidding_activity.csv',sep='|')\nbid_activity", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"html": "<pre>\n<class 'pandas.core.frame.DataFrame'>\nInt64Index: 77944 entries, 0 to 77943\nData columns (total 12 columns):\nBid_Amount 77944 non-null values\nBid_ID 77944 non-null values\nTime_of_Bid 77944 non-null values\nBid_Status 77944 non-null values\nGroup 77944 non-null values\nFaculty 77944 non-null values\nStudent_Type 77944 non-null values\nAccount 77944 non-null values\nModule 77944 non-null values\nBid_Round 77944 non-null values\nAcad_Yr 77944 non-null values\nSemester 77944 non-null values\ndtypes: int64(3), object(9)\n</pre>", | |
"output_type": "pyout", | |
"prompt_number": 2, | |
"text": "<class 'pandas.core.frame.DataFrame'>\nInt64Index: 77944 entries, 0 to 77943\nData columns (total 12 columns):\nBid_Amount 77944 non-null values\nBid_ID 77944 non-null values\nTime_of_Bid 77944 non-null values\nBid_Status 77944 non-null values\nGroup 77944 non-null values\nFaculty 77944 non-null values\nStudent_Type 77944 non-null values\nAccount 77944 non-null values\nModule 77944 non-null values\nBid_Round 77944 non-null values\nAcad_Yr 77944 non-null values\nSemester 77944 non-null values\ndtypes: int64(3), object(9)" | |
} | |
], | |
"prompt_number": 2 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "print(bid_activity.head())\nbid_activity.dtypes", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": " Bid_Amount Bid_ID Time_of_Bid Bid_Status Group \\\n0 100 B00004391322 09-01-2013 23:37:42 Successful LECTURE SL1 \n1 1 B00004393160 10-01-2013 08:55:07 Successful LECTURE SL1 \n2 843 B00004400492 10-01-2013 13:24:33 Successful LECTURE 1 \n3 500 B00004371243 07-01-2013 09:49:30 Successful LECTURE 1 \n4 500 B00004380361 09-01-2013 11:23:27 Successful LECTURE 1 \n\n Faculty Student_Type Account Module Bid_Round Acad_Yr \\\n0 SCIENCE null null LSM3224 2A 20122013 \n1 SCIENCE null null LSM3224 2A 20122013 \n2 ARTS & SOCIAL SCIENCES null null SC2211 2A 20122013 \n3 ARTS & SOCIAL SCIENCES null null SC2211 2A 20122013 \n4 ARTS & SOCIAL SCIENCES null null SC2211 2A 20122013 \n\n Semester \n0 2 \n1 2 \n2 2 \n3 2 \n4 2 \n" | |
}, | |
{ | |
"output_type": "pyout", | |
"prompt_number": 3, | |
"text": "Bid_Amount int64\nBid_ID object\nTime_of_Bid object\nBid_Status object\nGroup object\nFaculty object\nStudent_Type object\nAccount object\nModule object\nBid_Round object\nAcad_Yr int64\nSemester int64\ndtype: object" | |
} | |
], | |
"prompt_number": 3 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "from datetime import datetime\nbid_activity['Time_of_Bid'] = bid_activity['Time_of_Bid'].map(lambda date_str: datetime.strptime(date_str, \"%d-%m-%Y %H:%M:%S\"))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 4 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "print(bid_activity.head())", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": " Bid_Amount Bid_ID Time_of_Bid Bid_Status Group \\\n0 100 B00004391322 2013-01-09 23:37:42 Successful LECTURE SL1 \n1 1 B00004393160 2013-01-10 08:55:07 Successful LECTURE SL1 \n2 843 B00004400492 2013-01-10 13:24:33 Successful LECTURE 1 \n3 500 B00004371243 2013-01-07 09:49:30 Successful LECTURE 1 \n4 500 B00004380361 2013-01-09 11:23:27 Successful LECTURE 1 \n\n Faculty Student_Type Account Module Bid_Round Acad_Yr \\\n0 SCIENCE null null LSM3224 2A 20122013 \n1 SCIENCE null null LSM3224 2A 20122013 \n2 ARTS & SOCIAL SCIENCES null null SC2211 2A 20122013 \n3 ARTS & SOCIAL SCIENCES null null SC2211 2A 20122013 \n4 ARTS & SOCIAL SCIENCES null null SC2211 2A 20122013 \n\n Semester \n0 2 \n1 2 \n2 2 \n3 2 \n4 2 \n" | |
} | |
], | |
"prompt_number": 5 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "for c in ['Bid_Status','Faculty','Student_Type','Account','Bid_Round']:\n print(c,set(bid_activity[c]))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Bid_Status {'Unsuccessful', 'Successful'}\nFaculty {'YONG LOO LIN SCHOOL OF MEDICINE', 'ENGINEERING', 'SCHOOL OF COMPUTING', 'SCIENCE', 'ARTS & SOCIAL SCIENCES', 'JOINT MULTI-DISCIPLINARY PROGRAMMES', 'NUS', 'YONG SIEW TOH CONSERVATORY OF MUSIC', 'SCHOOL OF BUSINESS', 'SCHOOL OF DESIGN AND ENVIRONMENT'}\nStudent_Type {'null', 'Returning', 'New'}\nAccount {'null', 'General', 'Programme'}\nBid_Round {'1B', '1A', '2B', '3B', '3A', '2A'}\n" | |
} | |
], | |
"prompt_number": 6 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "def summarise(s):\n if s == 'null':\n return 'NA'\n else:\n return s[0]\n\nfor c in ['Bid_Status','Student_Type','Account']:\n bid_activity[c] = bid_activity[c].map(summarise)\nprint(bid_activity.head())", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": " Bid_Amount Bid_ID Time_of_Bid Bid_Status Group \\\n0 100 B00004391322 2013-01-09 23:37:42 S LECTURE SL1 \n1 1 B00004393160 2013-01-10 08:55:07 S LECTURE SL1 \n2 843 B00004400492 2013-01-10 13:24:33 S LECTURE 1 \n3 500 B00004371243 2013-01-07 09:49:30 S LECTURE 1 \n4 500 B00004380361 2013-01-09 11:23:27 S LECTURE 1 \n\n Faculty Student_Type Account Module Bid_Round Acad_Yr \\\n0 SCIENCE NA NA LSM3224 2A 20122013 \n1 SCIENCE NA NA LSM3224 2A 20122013 \n2 ARTS & SOCIAL SCIENCES NA NA SC2211 2A 20122013 \n3 ARTS & SOCIAL SCIENCES NA NA SC2211 2A 20122013 \n4 ARTS & SOCIAL SCIENCES NA NA SC2211 2A 20122013 \n\n Semester \n0 2 \n1 2 \n2 2 \n3 2 \n4 2 \n" | |
} | |
], | |
"prompt_number": 7 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "for c in ['Bid_Status','Faculty','Student_Type','Account','Bid_Round']:\n print(c,set(bid_activity[c]))", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"stream": "stdout", | |
"text": "Bid_Status {'U', 'S'}\nFaculty {'YONG LOO LIN SCHOOL OF MEDICINE', 'ENGINEERING', 'SCHOOL OF COMPUTING', 'SCIENCE', 'ARTS & SOCIAL SCIENCES', 'JOINT MULTI-DISCIPLINARY PROGRAMMES', 'NUS', 'YONG SIEW TOH CONSERVATORY OF MUSIC', 'SCHOOL OF BUSINESS', 'SCHOOL OF DESIGN AND ENVIRONMENT'}\nStudent_Type {'NA', 'N', 'R'}\nAccount {'NA', 'G', 'P'}\nBid_Round {'1B', '1A', '2B', '3B', '3A', '2A'}\n" | |
} | |
], | |
"prompt_number": 8 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "bid_activity.to_csv('nus_bidding_activity.csv',index=False)", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [], | |
"prompt_number": 10 | |
}, | |
{ | |
"cell_type": "code", | |
"collapsed": false, | |
"input": "", | |
"language": "python", | |
"metadata": {}, | |
"outputs": [] | |
} | |
], | |
"metadata": {} | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment