Created
December 6, 2017 18:45
-
-
Save ricalanis/3f051c7fa3e08af9e85711ff3f10cd6f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
distrito | rangos | ||
---|---|---|---|
0 | 01 | [['0003', '0008'], ['0010', '0037'], ['0040', '0041'], ['1682', '1710'], ['0240', '0249'], ['0444', '0449'], ['1506', '1513'], ['1515', '1518'], ['1520', '1520']] | |
1 | 02 | [['0233', '0237'], ['0238', '0239'], ['0508', '0523'], ['0577', '0610'], ['0612', '0644'], ['1548', '1567'], ['0525', '0525']] | |
2 | 03 | [['0450', '0470'], ['0473', '0499'], ['0691', '0727'], ['1058', '1090']] | |
3 | 04 | [['0110', '0119'], ['0121', '0126'], ['0255', '0256'], ['0500', '0507'], ['0526', '0540'], ['0728', '0729'], ['1040', '1052'], ['1054', '1057'], ['1091', '1165'], ['1166', '1171']] | |
4 | 05 | [['0001', '0002'], ['0042', '0056'], ['0081', '0083'], ['0127', '0130'], ['0250', '0254'], ['0320', '0323'], ['0325', '0343'], ['0345', '0355'], ['0357', '0369'], ['0374', '0381'], ['0390', '0403'], ['0408', '0416'], ['0421', '0422'], ['0442', '0443'], ['0645', '0650'], ['1498', '1505'], ['0435', '0435'], ['0440', '0440']] | |
5 | 06 | [['0084', '0094'], ['0096', '0099'], ['0101', '0109'], ['1568', '1570'], ['0173', '0213'], ['0370', '0373'], ['0382', '0389'], ['0404', '0407'], ['0417', '0420'], ['0423', '0434'], ['0436', '0439'], ['0319', '0319'], ['0324', '0324'], ['0344', '0344'], ['0356', '0356'], ['0441', '0441']] | |
6 | 07 | [['0131', '0172'], ['0257', '0318'], ['1475', '1486'], ['1488', '1497'], ['1470', '1474']] | |
7 | 08 | [['1172', '1187'], ['1191', '1192'], ['1201', '1202'], ['1310', '1317'], ['1459', '1467']] | |
8 | 09 | [['1188', '1190'], ['1193', '1200'], ['1203', '1309'], ['1325', '1346'], ['1367', '1378'], ['1380', '1380'], ['1394', '1394'], ['1396', '1396']] | |
9 | 10 | [['1391', '1393'], ['1397', '1408'], ['1412', '1458'], ['1521', '1547'], ['1571', '1610'], ['1379', '1379'], ['1381', '1381'], ['1395', '1395'], ['1410', '1410'], ['1468', '1468']] | |
10 | 11 | [['1318', '1324'], ['1347', '1366'], ['1382', '1390'], ['1469', '1469']] | |
11 | 12 | [['0057', '0080'], ['0214', '0231'], ['0542', '0571'], ['0573', '0576'], ['0652', '0690']] | |
12 | 13 | [['0736', '0738'], ['0743', '0748'], ['0765', '0771'], ['0785', '0797'], ['0811', '0827'], ['0844', '0859'], ['0877', '0895'], ['0916', '0923'], ['0925', '0925'], ['0933', '0933']] | |
13 | 14 | [['0730', '0735'], ['0739', '0742'], ['0749', '0764'], ['0772', '0780'], ['0800', '0805'], ['0868', '0869'], ['1611', '1641'], ['0829', '0829'], ['0831', '0831']] | |
14 | 15 | [['0781', '0784'], ['0798', '0799'], ['0806', '0810'], ['0832', '0843'], ['0860', '0867'], ['0870', '0876'], ['0896', '0915'], ['0934', '0960'], ['0978', '0979'], ['0828', '0828']] | |
15 | 16 | [['0926', '0932'], ['0961', '0977'], ['0980', '0991'], ['0993', '1039'], ['1642', '1681'], ['0924', '0924']] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import PyPDF2" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Load PDF" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 33, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pdfFileObj = open('D05.pdf', 'rb')\n", | |
"pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n", | |
"text = \"\"\n", | |
"for page in range(pdfReader.numPages):\n", | |
" pageObj = pdfReader.getPage(page)\n", | |
" text = text + pageObj.extractText()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Load Distritos, by capturing Distrito title" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 125, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"distritos_split = text.split(\"\\nDistrito \")\n", | |
"district_data = []\n", | |
"# We jump first element that has no sections.\n", | |
"for i in range(1,len(distritos_split)):\n", | |
" section_ranges = extract_ranges(distritos_split[i].replace(\"\\n\",\"\"))\n", | |
" district = distritos_split[i][1:3]\n", | |
" district_data.append([district, section_ranges])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Extract Ranges\n", | |
"- We find the ranges from document with the phrase that defines it\n", | |
"- Then we compare to the number present. Individual numbers is the difference between the numbers in ranges and other found" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 124, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def extract_ranges(text):\n", | |
" ranges = re.findall('[0-9][0-9][0-9][0-9] a la [0-9][0-9][0-9][0-9]', text)\n", | |
" single_numbers = re.findall('[0-9][0-9][0-9][0-9]', text)\n", | |
" sections = []\n", | |
" individual_sections_in_ranges = []\n", | |
" for datum in ranges:\n", | |
" selected_sections = datum.split(\" a la \")\n", | |
" sections.append(selected_sections)\n", | |
" individual_sections_in_ranges.append(selected_sections[0])\n", | |
" individual_sections_in_ranges.append(selected_sections[1])\n", | |
" sections = sections + [[section, section] for section in single_numbers if section not in individual_sections_in_ranges]\n", | |
" return sections" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### We validate that we capture all sections" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 137, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def sum_sections(array_sections):\n", | |
" sum_data = 0\n", | |
" for section_range in array_sections:\n", | |
" sum_data = sum_data + int(section_range[1]) - int(section_range[0]) + 1\n", | |
" return sum_data" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 138, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"sum_district = 0\n", | |
"for section in district_data:\n", | |
" sum_district = sum_district + sum_sections(section[1]) " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## We know from the map, that coahuila does have 1688. Sections" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 139, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1688" | |
] | |
}, | |
"execution_count": 139, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"sum_district" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 140, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 144, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"pd.DataFrame(district_data, columns = [\"distrito\", \"rangos\"]).to_csv(\"coahuila_distritos_locales.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['0003', '0008', '0040', '0041', '1682', '1710', '0240', '0249', '0444', '0449', '1506', '1513']\n", | |
"['0003', '0008', '0010', '0037', '0040', '0041', '1682', '1710', '0240', '0249', '0444', '0449', '1506', '1513', '1515', '1518', '1520']\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[['0003', '0008'],\n", | |
" ['0040', '0041'],\n", | |
" ['1682', '1710'],\n", | |
" ['0240', '0249'],\n", | |
" ['0444', '0449'],\n", | |
" ['1506', '1513'],\n", | |
" ['0010', '0010'],\n", | |
" ['0037', '0037'],\n", | |
" ['1515', '1515'],\n", | |
" ['1518', '1518'],\n", | |
" ['1520', '1520']]" | |
] | |
}, | |
"execution_count": 103, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"extract_ranges(distritos_split[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import re" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 75, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"m = re.findall('[0-9][0-9][0-9][0-9]', crude_data[1])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 76, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"['0233',\n", | |
" '0237',\n", | |
" '0238',\n", | |
" '0239',\n", | |
" '0508',\n", | |
" '0523',\n", | |
" '0525',\n", | |
" '0577',\n", | |
" '0610',\n", | |
" '0612',\n", | |
" '0644',\n", | |
" '1548',\n", | |
" '1567']" | |
] | |
}, | |
"execution_count": 76, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"m" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Non Kanon" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 145, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"17\n" | |
] | |
} | |
], | |
"source": [ | |
"print(len(text.split(\"\\nDistrito \")))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 146, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"distritos_split = text.split(\"\\nDistrito \")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 147, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"['\\n ', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16']\n" | |
] | |
} | |
], | |
"source": [ | |
"print([distrito[1:3] for distrito in distritos_split])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.3" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment