Skip to content

Instantly share code, notes, and snippets.

@ricalanis
Created December 6, 2017 18:45
Show Gist options
  • Save ricalanis/3f051c7fa3e08af9e85711ff3f10cd6f to your computer and use it in GitHub Desktop.
Save ricalanis/3f051c7fa3e08af9e85711ff3f10cd6f to your computer and use it in GitHub Desktop.
distrito rangos
0 01 [['0003', '0008'], ['0010', '0037'], ['0040', '0041'], ['1682', '1710'], ['0240', '0249'], ['0444', '0449'], ['1506', '1513'], ['1515', '1518'], ['1520', '1520']]
1 02 [['0233', '0237'], ['0238', '0239'], ['0508', '0523'], ['0577', '0610'], ['0612', '0644'], ['1548', '1567'], ['0525', '0525']]
2 03 [['0450', '0470'], ['0473', '0499'], ['0691', '0727'], ['1058', '1090']]
3 04 [['0110', '0119'], ['0121', '0126'], ['0255', '0256'], ['0500', '0507'], ['0526', '0540'], ['0728', '0729'], ['1040', '1052'], ['1054', '1057'], ['1091', '1165'], ['1166', '1171']]
4 05 [['0001', '0002'], ['0042', '0056'], ['0081', '0083'], ['0127', '0130'], ['0250', '0254'], ['0320', '0323'], ['0325', '0343'], ['0345', '0355'], ['0357', '0369'], ['0374', '0381'], ['0390', '0403'], ['0408', '0416'], ['0421', '0422'], ['0442', '0443'], ['0645', '0650'], ['1498', '1505'], ['0435', '0435'], ['0440', '0440']]
5 06 [['0084', '0094'], ['0096', '0099'], ['0101', '0109'], ['1568', '1570'], ['0173', '0213'], ['0370', '0373'], ['0382', '0389'], ['0404', '0407'], ['0417', '0420'], ['0423', '0434'], ['0436', '0439'], ['0319', '0319'], ['0324', '0324'], ['0344', '0344'], ['0356', '0356'], ['0441', '0441']]
6 07 [['0131', '0172'], ['0257', '0318'], ['1475', '1486'], ['1488', '1497'], ['1470', '1474']]
7 08 [['1172', '1187'], ['1191', '1192'], ['1201', '1202'], ['1310', '1317'], ['1459', '1467']]
8 09 [['1188', '1190'], ['1193', '1200'], ['1203', '1309'], ['1325', '1346'], ['1367', '1378'], ['1380', '1380'], ['1394', '1394'], ['1396', '1396']]
9 10 [['1391', '1393'], ['1397', '1408'], ['1412', '1458'], ['1521', '1547'], ['1571', '1610'], ['1379', '1379'], ['1381', '1381'], ['1395', '1395'], ['1410', '1410'], ['1468', '1468']]
10 11 [['1318', '1324'], ['1347', '1366'], ['1382', '1390'], ['1469', '1469']]
11 12 [['0057', '0080'], ['0214', '0231'], ['0542', '0571'], ['0573', '0576'], ['0652', '0690']]
12 13 [['0736', '0738'], ['0743', '0748'], ['0765', '0771'], ['0785', '0797'], ['0811', '0827'], ['0844', '0859'], ['0877', '0895'], ['0916', '0923'], ['0925', '0925'], ['0933', '0933']]
13 14 [['0730', '0735'], ['0739', '0742'], ['0749', '0764'], ['0772', '0780'], ['0800', '0805'], ['0868', '0869'], ['1611', '1641'], ['0829', '0829'], ['0831', '0831']]
14 15 [['0781', '0784'], ['0798', '0799'], ['0806', '0810'], ['0832', '0843'], ['0860', '0867'], ['0870', '0876'], ['0896', '0915'], ['0934', '0960'], ['0978', '0979'], ['0828', '0828']]
15 16 [['0926', '0932'], ['0961', '0977'], ['0980', '0991'], ['0993', '1039'], ['1642', '1681'], ['0924', '0924']]
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import PyPDF2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load PDF"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# Read every page of the district PDF and join the extracted text into one string.\n",
"# The context manager closes the file handle, which the original cell left open.\n",
"with open('D05.pdf', 'rb') as pdfFileObj:\n",
"    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n",
"    # Extraction happens inside the with-block, while the file is still open.\n",
"    text = \"\".join(\n",
"        pdfReader.getPage(page).extractText()\n",
"        for page in range(pdfReader.numPages)\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load districts by splitting the text at each \"Distrito\" title"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
"# Split the document at every district title. The first chunk comes before any\n",
"# title and has no sections, so it is skipped.\n",
"distritos_split = text.split(\"\\nDistrito \")\n",
"# One [district, ranges] entry per chunk: the district id is read from characters\n",
"# 1-2 of the chunk, and line breaks are removed before extracting the ranges.\n",
"district_data = [\n",
"    [chunk[1:3], extract_ranges(chunk.replace(\"\\n\",\"\"))]\n",
"    for chunk in distritos_split[1:]\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Extract Ranges\n",
"- We find the ranges in the document by matching the phrase that defines them (`NNNN a la NNNN`).\n",
"- Then we compare them with all the numbers present: the individual sections are the numbers found that are not endpoints of any range."
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
"def extract_ranges(text):\n",
"    \"\"\"Return the section ranges mentioned in a district's text.\n",
"\n",
"    Args:\n",
"        text: Text of one district, containing four-digit section numbers either\n",
"            alone or as ranges written as 'NNNN a la NNNN'.\n",
"\n",
"    Returns:\n",
"        A list of [start, end] pairs of four-digit strings: every range first, in\n",
"        order of appearance, followed by [n, n] for each four-digit number that is\n",
"        not an endpoint of any range.\n",
"    \"\"\"\n",
"    # Imported locally because the notebook only imports re in a later cell.\n",
"    import re\n",
"\n",
"    # Whitespace around 'a la' is matched with \\s+ so that a range broken across\n",
"    # lines is still captured as a range instead of two bogus single sections.\n",
"    range_pairs = re.findall(r'([0-9]{4})\\s+a\\s+la\\s+([0-9]{4})', text)\n",
"    sections = [list(pair) for pair in range_pairs]\n",
"\n",
"    # Set of every range endpoint, for constant-time membership checks below.\n",
"    endpoints = {number for pair in range_pairs for number in pair}\n",
"\n",
"    # Any other four-digit number is an individual section, stored as [n, n].\n",
"    singles = [\n",
"        [number, number]\n",
"        for number in re.findall(r'[0-9]{4}', text)\n",
"        if number not in endpoints\n",
"    ]\n",
"    return sections + singles"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Validate that we captured all sections"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def sum_sections(array_sections):\n",
"    \"\"\"Count the sections covered by a list of inclusive [start, end] ranges.\n",
"\n",
"    Each range contributes end - start + 1 sections. Both bounds are converted\n",
"    with int(), so numeric strings such as '0003' are accepted.\n",
"    \"\"\"\n",
"    return sum(\n",
"        int(bounds[1]) - int(bounds[0]) + 1\n",
"        for bounds in array_sections\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Total number of sections across all districts; each entry of district_data\n",
"# holds its list of ranges at index 1.\n",
"sum_district = sum(sum_sections(entry[1]) for entry in district_data)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## We know from the map that Coahuila has 1688 sections"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1688"
]
},
"execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sum_district"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
"# Build one row per district, then write the table (with its default index) to CSV.\n",
"distritos_df = pd.DataFrame(district_data, columns = [\"distrito\", \"rangos\"])\n",
"distritos_df.to_csv(\"coahuila_distritos_locales.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['0003', '0008', '0040', '0041', '1682', '1710', '0240', '0249', '0444', '0449', '1506', '1513']\n",
"['0003', '0008', '0010', '0037', '0040', '0041', '1682', '1710', '0240', '0249', '0444', '0449', '1506', '1513', '1515', '1518', '1520']\n"
]
},
{
"data": {
"text/plain": [
"[['0003', '0008'],\n",
" ['0040', '0041'],\n",
" ['1682', '1710'],\n",
" ['0240', '0249'],\n",
" ['0444', '0449'],\n",
" ['1506', '1513'],\n",
" ['0010', '0010'],\n",
" ['0037', '0037'],\n",
" ['1515', '1515'],\n",
" ['1518', '1518'],\n",
" ['1520', '1520']]"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"extract_ranges(distritos_split[1])"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import re"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"m = re.findall('[0-9][0-9][0-9][0-9]', crude_data[1])"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['0233',\n",
" '0237',\n",
" '0238',\n",
" '0239',\n",
" '0508',\n",
" '0523',\n",
" '0525',\n",
" '0577',\n",
" '0610',\n",
" '0612',\n",
" '0644',\n",
" '1548',\n",
" '1567']"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"m"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Non Kanon"
]
},
{
"cell_type": "code",
"execution_count": 145,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"17\n"
]
}
],
"source": [
"print(len(text.split(\"\\nDistrito \")))"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"distritos_split = text.split(\"\\nDistrito \")"
]
},
{
"cell_type": "code",
"execution_count": 147,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['\\n ', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16']\n"
]
}
],
"source": [
"print([distrito[1:3] for distrito in distritos_split])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment