Last active
May 13, 2018 21:36
-
-
Save binga/90dfb50edfd6d6648f90db57cab6f995 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Import necessary libraries\n", | |
"import numpy as np\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" 5763152 Nss68_1.0_Type1/Data/R6801T1L05.TXT\r\n" | |
] | |
} | |
], | |
"source": [ | |
"! wc -l Nss68_1.0_Type1/Data/R6801T1L05.TXT" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"5.7M records are available with that have the state - district - consumption values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def parser_level_5(x):\n", | |
" # Refer to the excel sheet for the data format in the following order\n", | |
" return [x[15:18], x[18:20], x[39:42], x[59:68], x[68:76]]\n", | |
"\n", | |
"fields = ['State', 'District', 'ItemCode', 'TC_Quantity', 'TC_Value']\n", | |
"\n", | |
"# Which item statistics would you want? Change the following number to that corresponding number\n", | |
"item_code = '113'\n", | |
"\n", | |
"# Let's parse the file and extract relevant fields using the above parser_level_5 function\n", | |
"data5 = pd.read_table(\"Nss68_1.0_Type1/Data/R6801T1L05.TXT\", header=None)#, nrows=6000)\n", | |
"data5 = data5[0].apply(lambda x: pd.Series(parser_level_5(x)))\n", | |
"data5.columns = fields\n", | |
"\n", | |
"# Filter the data that we need\n", | |
"data5 = data5[data5['ItemCode'] == item_code].reset_index(drop=True)\n", | |
"\n", | |
"# Change data types for numerical aggregations\n", | |
"for col in data5.columns:\n", | |
" data5[col] = data5[col].astype(np.int64)\n", | |
"\n", | |
"data5.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>State</th>\n", | |
" <th>District</th>\n", | |
" <th>TC_Quantity</th>\n", | |
" <th>TC_Value</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>171</td>\n", | |
" <td>1</td>\n", | |
" <td>28200</td>\n", | |
" <td>1160</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>171</td>\n", | |
" <td>2</td>\n", | |
" <td>18600</td>\n", | |
" <td>946</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>171</td>\n", | |
" <td>3</td>\n", | |
" <td>3800</td>\n", | |
" <td>160</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" State District TC_Quantity TC_Value\n", | |
"0 171 1 28200 1160\n", | |
"1 171 2 18600 946\n", | |
"2 171 3 3800 160" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data5 = data5.pivot_table(index=['State', 'District'], values=['TC_Quantity', 'TC_Value'], aggfunc=np.sum).reset_index()\n", | |
"data5.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data5.to_csv(\"States_Districts_Q_V_stats.csv\", index=False)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>st</th>\n", | |
" <th>dc</th>\n", | |
" <th>district name</th>\n", | |
" <th>state name</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td>Kupwara</td>\n", | |
" <td>Jammu & Kashmir</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td>Baramula</td>\n", | |
" <td>Jammu & Kashmir</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>Srinagar</td>\n", | |
" <td>Jammu & Kashmir</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>1</td>\n", | |
" <td>4</td>\n", | |
" <td>Badgam</td>\n", | |
" <td>Jammu & Kashmir</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>1</td>\n", | |
" <td>5</td>\n", | |
" <td>Pulwama</td>\n", | |
" <td>Jammu & Kashmir</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" st dc district name state name\n", | |
"0 1 1 Kupwara Jammu & Kashmir\n", | |
"1 1 2 Baramula Jammu & Kashmir\n", | |
"2 1 3 Srinagar Jammu & Kashmir\n", | |
"3 1 4 Badgam Jammu & Kashmir\n", | |
"4 1 5 Pulwama Jammu & Kashmir" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# <<pip install xlrd>> before running this cell\n", | |
"states_mappings = pd.read_excel(\"Nss68_1.0_Type1/Supporting Documents_68/District-codes.xls\", skiprows=2)\n", | |
"states_mappings.head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Bunch of questions:\n", | |
"- While the state field is of length 3 in the layout68_1.0_typ1.xls master reference document, the state code is often found to be of length 2 in the state_code - district mappings document (District-codes.xls). This makes the data inconsistent and thus joins are risky here." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment