ricalanis/coahuila_distritos_locales.csv

## coahuila_distritos_locales.csv

          
            
            distrito
            rangos

            
              0
              01
              [['0003', '0008'], ['0010', '0037'], ['0040', '0041'], ['1682', '1710'], ['0240', '0249'], ['0444', '0449'], ['1506', '1513'], ['1515', '1518'], ['1520', '1520']]

            
              1
              02
              [['0233', '0237'], ['0238', '0239'], ['0508', '0523'], ['0577', '0610'], ['0612', '0644'], ['1548', '1567'], ['0525', '0525']]

            
              2
              03
              [['0450', '0470'], ['0473', '0499'], ['0691', '0727'], ['1058', '1090']]

            
              3
              04
              [['0110', '0119'], ['0121', '0126'], ['0255', '0256'], ['0500', '0507'], ['0526', '0540'], ['0728', '0729'], ['1040', '1052'], ['1054', '1057'], ['1091', '1165'], ['1166', '1171']]

            
              4
              05
              [['0001', '0002'], ['0042', '0056'], ['0081', '0083'], ['0127', '0130'], ['0250', '0254'], ['0320', '0323'], ['0325', '0343'], ['0345', '0355'], ['0357', '0369'], ['0374', '0381'], ['0390', '0403'], ['0408', '0416'], ['0421', '0422'], ['0442', '0443'], ['0645', '0650'], ['1498', '1505'], ['0435', '0435'], ['0440', '0440']]

            
              5
              06
              [['0084', '0094'], ['0096', '0099'], ['0101', '0109'], ['1568', '1570'], ['0173', '0213'], ['0370', '0373'], ['0382', '0389'], ['0404', '0407'], ['0417', '0420'], ['0423', '0434'], ['0436', '0439'], ['0319', '0319'], ['0324', '0324'], ['0344', '0344'], ['0356', '0356'], ['0441', '0441']]

            
              6
              07
              [['0131', '0172'], ['0257', '0318'], ['1475', '1486'], ['1488', '1497'], ['1470', '1474']]

            
              7
              08
              [['1172', '1187'], ['1191', '1192'], ['1201', '1202'], ['1310', '1317'], ['1459', '1467']]

            
              8
              09
              [['1188', '1190'], ['1193', '1200'], ['1203', '1309'], ['1325', '1346'], ['1367', '1378'], ['1380', '1380'], ['1394', '1394'], ['1396', '1396']]

            
              9
              10
              [['1391', '1393'], ['1397', '1408'], ['1412', '1458'], ['1521', '1547'], ['1571', '1610'], ['1379', '1379'], ['1381', '1381'], ['1395', '1395'], ['1410', '1410'], ['1468', '1468']]

            
              10
              11
              [['1318', '1324'], ['1347', '1366'], ['1382', '1390'], ['1469', '1469']]

            
              11
              12
              [['0057', '0080'], ['0214', '0231'], ['0542', '0571'], ['0573', '0576'], ['0652', '0690']]

            
              12
              13
              [['0736', '0738'], ['0743', '0748'], ['0765', '0771'], ['0785', '0797'], ['0811', '0827'], ['0844', '0859'], ['0877', '0895'], ['0916', '0923'], ['0925', '0925'], ['0933', '0933']]

            
              13
              14
              [['0730', '0735'], ['0739', '0742'], ['0749', '0764'], ['0772', '0780'], ['0800', '0805'], ['0868', '0869'], ['1611', '1641'], ['0829', '0829'], ['0831', '0831']]

            
              14
              15
              [['0781', '0784'], ['0798', '0799'], ['0806', '0810'], ['0832', '0843'], ['0860', '0867'], ['0870', '0876'], ['0896', '0915'], ['0934', '0960'], ['0978', '0979'], ['0828', '0828']]

            
              15
              16
              [['0926', '0932'], ['0961', '0977'], ['0980', '0991'], ['0993', '1039'], ['1642', '1681'], ['0924', '0924']]

## pdf-distritos.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "import PyPDF2"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load PDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every page of the district PDF and concatenate the extracted text.\n",
    "# The context manager closes the file handle as soon as extraction is finished;\n",
    "# the pages are read inside the block because the reader needs the open file.\n",
    "with open('D05.pdf', 'rb') as pdfFileObj:\n",
    "    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n",
    "    page_texts = [\n",
    "        pdfReader.getPage(page).extractText()\n",
    "        for page in range(pdfReader.numPages)\n",
    "    ]\n",
    "text = \"\".join(page_texts)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load the districts by splitting the text at each \"Distrito\" title"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cut the extracted text at every line that starts with 'Distrito '.\n",
    "distritos_split = text.split(\"\\nDistrito \")\n",
    "# The first chunk is the text before the first district title and has no sections, so it is skipped.\n",
    "# Characters 1-2 of each remaining chunk are used as the district label.\n",
    "district_data = [\n",
    "    [chunk[1:3], extract_ranges(chunk.replace(\"\\n\", \"\"))]\n",
    "    for chunk in distritos_split[1:]\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Extract Ranges\n",
    "- We find the ranges in the document using the phrase that defines them (\"NNNN a la NNNN\").\n",
    "- Then we compare them with every four-digit number present: the individual sections are the numbers found in the text that are not an endpoint of any range."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_ranges(text):\n",
    "    \"\"\"Return the section ranges mentioned in ``text`` as ``[start, end]`` pairs of strings.\n",
    "\n",
    "    Every phrase of the form ``NNNN a la NNNN`` becomes one ``[start, end]`` pair, in order of\n",
    "    appearance. Every other four-digit match whose text is not an endpoint of one of those\n",
    "    phrases is appended afterwards as a single-section pair ``[n, n]``.\n",
    "\n",
    "    The ``re`` module must be imported before this function is called.\n",
    "    \"\"\"\n",
    "    range_phrases = re.findall('[0-9]{4} a la [0-9]{4}', text)\n",
    "    sections = [phrase.split(\" a la \") for phrase in range_phrases]\n",
    "    # A set makes the endpoint lookup below O(1) instead of rescanning a list for every number.\n",
    "    range_endpoints = {endpoint for pair in sections for endpoint in pair}\n",
    "    four_digit_numbers = re.findall('[0-9]{4}', text)\n",
    "    sections.extend(\n",
    "        [number, number] for number in four_digit_numbers if number not in range_endpoints\n",
    "    )\n",
    "    return sections"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Validate that all sections were captured"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 137,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def sum_sections(array_sections):\n",
    "    \"\"\"Count how many sections a list of ``[start, end]`` pairs covers, both ends included.\"\"\"\n",
    "    return sum(int(pair[1]) - int(pair[0]) + 1 for pair in array_sections)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Add up the section counts of every district; entry[1] is that district's list of ranges.\n",
    "sum_district = sum(sum_sections(entry[1]) for entry in district_data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## We know from the map that Coahuila has 1688 sections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1688"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Grand total of sections captured across all districts.\n",
    "sum_district"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 140,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write one row per district. The DataFrame index is written as the unnamed first CSV column,\n",
    "# and each `rangos` cell is stored as the text form of its list of [start, end] pairs.\n",
    "pd.DataFrame(district_data, columns = [\"distrito\", \"rangos\"]).to_csv(\"coahuila_distritos_locales.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['0003', '0008', '0040', '0041', '1682', '1710', '0240', '0249', '0444', '0449', '1506', '1513']\n",
      "['0003', '0008', '0010', '0037', '0040', '0041', '1682', '1710', '0240', '0249', '0444', '0449', '1506', '1513', '1515', '1518', '1520']\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[['0003', '0008'],\n",
       " ['0040', '0041'],\n",
       " ['1682', '1710'],\n",
       " ['0240', '0249'],\n",
       " ['0444', '0449'],\n",
       " ['1506', '1513'],\n",
       " ['0010', '0010'],\n",
       " ['0037', '0037'],\n",
       " ['1515', '1515'],\n",
       " ['1518', '1518'],\n",
       " ['1520', '1520']]"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check the first district chunk. Unlike the district loop, the chunk is passed here\n",
    "# without removing its newlines first.\n",
    "# NOTE(review): the saved output of this cell includes printed lists that the current\n",
    "# extract_ranges never prints, so it appears to come from an earlier version - re-run to refresh.\n",
    "extract_ranges(distritos_split[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Scratch check: list every four-digit match in one chunk of text.\n",
    "# NOTE(review): `crude_data` is not defined in any cell of this notebook, so this cell fails on a\n",
    "# fresh kernel; it presumably held the per-district text chunks - confirm or switch to distritos_split.\n",
    "m = re.findall('[0-9][0-9][0-9][0-9]', crude_data[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['0233',\n",
       " '0237',\n",
       " '0238',\n",
       " '0239',\n",
       " '0508',\n",
       " '0523',\n",
       " '0525',\n",
       " '0577',\n",
       " '0610',\n",
       " '0612',\n",
       " '0644',\n",
       " '1548',\n",
       " '1567']"
      ]
     },
     "execution_count": 76,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the four-digit matches collected in `m`.\n",
    "m"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Non-canon"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17\n"
     ]
    }
   ],
   "source": [
    "# Number of chunks produced by splitting on the district title\n",
    "# (the leading text plus one chunk per district).\n",
    "print(len(text.split(\"\\nDistrito \")))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Split the extracted text at every line that starts with 'Distrito '.\n",
    "distritos_split = text.split(\"\\nDistrito \")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['\\n ', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16']\n"
     ]
    }
   ],
   "source": [
    "# Characters 1-2 of each chunk hold the two-digit district label\n",
    "# (the first chunk is the leading text, not a district).\n",
    "print([distrito[1:3] for distrito in distritos_split])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	distrito	rangos
0	01	[['0003', '0008'], ['0010', '0037'], ['0040', '0041'], ['1682', '1710'], ['0240', '0249'], ['0444', '0449'], ['1506', '1513'], ['1515', '1518'], ['1520', '1520']]
1	02	[['0233', '0237'], ['0238', '0239'], ['0508', '0523'], ['0577', '0610'], ['0612', '0644'], ['1548', '1567'], ['0525', '0525']]
2	03	[['0450', '0470'], ['0473', '0499'], ['0691', '0727'], ['1058', '1090']]
3	04	[['0110', '0119'], ['0121', '0126'], ['0255', '0256'], ['0500', '0507'], ['0526', '0540'], ['0728', '0729'], ['1040', '1052'], ['1054', '1057'], ['1091', '1165'], ['1166', '1171']]
4	05	[['0001', '0002'], ['0042', '0056'], ['0081', '0083'], ['0127', '0130'], ['0250', '0254'], ['0320', '0323'], ['0325', '0343'], ['0345', '0355'], ['0357', '0369'], ['0374', '0381'], ['0390', '0403'], ['0408', '0416'], ['0421', '0422'], ['0442', '0443'], ['0645', '0650'], ['1498', '1505'], ['0435', '0435'], ['0440', '0440']]
5	06	[['0084', '0094'], ['0096', '0099'], ['0101', '0109'], ['1568', '1570'], ['0173', '0213'], ['0370', '0373'], ['0382', '0389'], ['0404', '0407'], ['0417', '0420'], ['0423', '0434'], ['0436', '0439'], ['0319', '0319'], ['0324', '0324'], ['0344', '0344'], ['0356', '0356'], ['0441', '0441']]
6	07	[['0131', '0172'], ['0257', '0318'], ['1475', '1486'], ['1488', '1497'], ['1470', '1474']]
7	08	[['1172', '1187'], ['1191', '1192'], ['1201', '1202'], ['1310', '1317'], ['1459', '1467']]
8	09	[['1188', '1190'], ['1193', '1200'], ['1203', '1309'], ['1325', '1346'], ['1367', '1378'], ['1380', '1380'], ['1394', '1394'], ['1396', '1396']]
9	10	[['1391', '1393'], ['1397', '1408'], ['1412', '1458'], ['1521', '1547'], ['1571', '1610'], ['1379', '1379'], ['1381', '1381'], ['1395', '1395'], ['1410', '1410'], ['1468', '1468']]
10	11	[['1318', '1324'], ['1347', '1366'], ['1382', '1390'], ['1469', '1469']]
11	12	[['0057', '0080'], ['0214', '0231'], ['0542', '0571'], ['0573', '0576'], ['0652', '0690']]
12	13	[['0736', '0738'], ['0743', '0748'], ['0765', '0771'], ['0785', '0797'], ['0811', '0827'], ['0844', '0859'], ['0877', '0895'], ['0916', '0923'], ['0925', '0925'], ['0933', '0933']]
13	14	[['0730', '0735'], ['0739', '0742'], ['0749', '0764'], ['0772', '0780'], ['0800', '0805'], ['0868', '0869'], ['1611', '1641'], ['0829', '0829'], ['0831', '0831']]
14	15	[['0781', '0784'], ['0798', '0799'], ['0806', '0810'], ['0832', '0843'], ['0860', '0867'], ['0870', '0876'], ['0896', '0915'], ['0934', '0960'], ['0978', '0979'], ['0828', '0828']]
15	16	[['0926', '0932'], ['0961', '0977'], ['0980', '0991'], ['0993', '1039'], ['1642', '1681'], ['0924', '0924']]
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 9,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import PyPDF2"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Load PDF"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 33,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Read every page of the district PDF and concatenate the extracted text.\n",
	"# The context manager closes the file handle as soon as extraction is finished;\n",
	"# the pages are read inside the block because the reader needs the open file.\n",
	"with open('D05.pdf', 'rb') as pdfFileObj:\n",
	"    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)\n",
	"    page_texts = [\n",
	"        pdfReader.getPage(page).extractText()\n",
	"        for page in range(pdfReader.numPages)\n",
	"    ]\n",
	"text = \"\".join(page_texts)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Load the districts by splitting the text at each \"Distrito\" title"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 125,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Cut the extracted text at every line that starts with 'Distrito '.\n",
	"distritos_split = text.split(\"\\nDistrito \")\n",
	"# The first chunk is the text before the first district title and has no sections, so it is skipped.\n",
	"# Characters 1-2 of each remaining chunk are used as the district label.\n",
	"district_data = [\n",
	"    [chunk[1:3], extract_ranges(chunk.replace(\"\\n\", \"\"))]\n",
	"    for chunk in distritos_split[1:]\n",
	"]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Extract Ranges\n",
	"- We find the ranges in the document using the phrase that defines them (\"NNNN a la NNNN\").\n",
	"- Then we compare them with every four-digit number present: the individual sections are the numbers found in the text that are not an endpoint of any range."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 124,
	"metadata": {},
	"outputs": [],
	"source": [
	"def extract_ranges(text):\n",
	"    \"\"\"Return the section ranges mentioned in ``text`` as ``[start, end]`` pairs of strings.\n",
	"\n",
	"    Every phrase of the form ``NNNN a la NNNN`` becomes one ``[start, end]`` pair, in order of\n",
	"    appearance. Every other four-digit match whose text is not an endpoint of one of those\n",
	"    phrases is appended afterwards as a single-section pair ``[n, n]``.\n",
	"\n",
	"    The ``re`` module must be imported before this function is called.\n",
	"    \"\"\"\n",
	"    range_phrases = re.findall('[0-9]{4} a la [0-9]{4}', text)\n",
	"    sections = [phrase.split(\" a la \") for phrase in range_phrases]\n",
	"    # A set makes the endpoint lookup below O(1) instead of rescanning a list for every number.\n",
	"    range_endpoints = {endpoint for pair in sections for endpoint in pair}\n",
	"    four_digit_numbers = re.findall('[0-9]{4}', text)\n",
	"    sections.extend(\n",
	"        [number, number] for number in four_digit_numbers if number not in range_endpoints\n",
	"    )\n",
	"    return sections"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"### Validate that all sections were captured"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 137,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def sum_sections(array_sections):\n",
	"    \"\"\"Count how many sections a list of ``[start, end]`` pairs covers, both ends included.\"\"\"\n",
	"    return sum(int(pair[1]) - int(pair[0]) + 1 for pair in array_sections)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 138,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Add up the section counts of every district; entry[1] is that district's list of ranges.\n",
	"sum_district = sum(sum_sections(entry[1]) for entry in district_data)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## We know from the map that Coahuila has 1688 sections"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 139,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1688"
	]
	},
	"execution_count": 139,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Grand total of sections captured across all districts.\n",
	"sum_district"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 140,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 144,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Write one row per district. The DataFrame index is written as the unnamed first CSV column,\n",
	"# and each `rangos` cell is stored as the text form of its list of [start, end] pairs.\n",
	"pd.DataFrame(district_data, columns = [\"distrito\", \"rangos\"]).to_csv(\"coahuila_distritos_locales.csv\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 103,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['0003', '0008', '0040', '0041', '1682', '1710', '0240', '0249', '0444', '0449', '1506', '1513']\n",
	"['0003', '0008', '0010', '0037', '0040', '0041', '1682', '1710', '0240', '0249', '0444', '0449', '1506', '1513', '1515', '1518', '1520']\n"
	]
	},
	{
	"data": {
	"text/plain": [
	"[['0003', '0008'],\n",
	" ['0040', '0041'],\n",
	" ['1682', '1710'],\n",
	" ['0240', '0249'],\n",
	" ['0444', '0449'],\n",
	" ['1506', '1513'],\n",
	" ['0010', '0010'],\n",
	" ['0037', '0037'],\n",
	" ['1515', '1515'],\n",
	" ['1518', '1518'],\n",
	" ['1520', '1520']]"
	]
	},
	"execution_count": 103,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Spot-check the first district chunk. Unlike the district loop, the chunk is passed here\n",
	"# without removing its newlines first.\n",
	"# NOTE(review): the saved output of this cell includes printed lists that the current\n",
	"# extract_ranges never prints, so it appears to come from an earlier version - re-run to refresh.\n",
	"extract_ranges(distritos_split[1])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 101,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import re"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 75,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Scratch check: list every four-digit match in one chunk of text.\n",
	"# NOTE(review): `crude_data` is not defined in any cell of this notebook, so this cell fails on a\n",
	"# fresh kernel; it presumably held the per-district text chunks - confirm or switch to distritos_split.\n",
	"m = re.findall('[0-9][0-9][0-9][0-9]', crude_data[1])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 76,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"['0233',\n",
	" '0237',\n",
	" '0238',\n",
	" '0239',\n",
	" '0508',\n",
	" '0523',\n",
	" '0525',\n",
	" '0577',\n",
	" '0610',\n",
	" '0612',\n",
	" '0644',\n",
	" '1548',\n",
	" '1567']"
	]
	},
	"execution_count": 76,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"# Show the four-digit matches collected in `m`.\n",
	"m"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Non-canon"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 145,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"17\n"
	]
	}
	],
	"source": [
	"# Number of chunks produced by splitting on the district title\n",
	"# (the leading text plus one chunk per district).\n",
	"print(len(text.split(\"\\nDistrito \")))"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 146,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Split the extracted text at every line that starts with 'Distrito '.\n",
	"distritos_split = text.split(\"\\nDistrito \")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 147,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"['\\n ', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16']\n"
	]
	}
	],
	"source": [
	"# Characters 1-2 of each chunk hold the two-digit district label\n",
	"# (the first chunk is the leading text, not a district).\n",
	"print([distrito[1:3] for distrito in distritos_split])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}