Skip to content

Instantly share code, notes, and snippets.

@mxcoder
Created October 4, 2017 03:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mxcoder/36d6f251835622cdafebe5ca1418bd9e to your computer and use it in GitHub Desktop.
Save mxcoder/36d6f251835622cdafebe5ca1418bd9e to your computer and use it in GitHub Desktop.
Script used to normalize and join datasets from Mexico's Education Ministry for all schools that are now open after the Earthquakes of Sep 2017
# coding: utf-8
import os
import csv
import pandas as pd
import unidecode
directory = '2017-10-03/'
estados = ['CDMX','CHIAPAS','EDOMEX','GUERRERO','HIDALGO','MICHOACAN','MORELOS','OAXACA','PUEBLA','TLAXCALA']
files = [f for f in os.listdir(os.path.join(directory, 'csv'))]
frames = []
for filename in files:
try:
f = open('csv/'+filename, 'r')
# Process CSV to remove headers and possible unwanted rows
skips = 0
headers = False
reader = csv.reader(f)
preheaders = [filename.upper()]
for line in reader:
first = line[0].strip()
# Look for first cell with expected value
if ((first == 'CCT') or (first == 'CLAVE')):
headers = line
break
else:
skips += 1
preheaders.append((''.join(list(filter(None, line))).upper()))
# Normalize and detect estado from lines above headers
preheaders = str(unidecode.unidecode('|'.join(preheaders)).encode("ascii"))
estado = 'CDMX'
for _estado in estados:
if _estado in preheaders:
estado = _estado
# Create DataFrame from CSV
f.seek(0)
df = pd.read_csv(f, skip_blank_lines=True, skiprows=skips)
# Normalize DataFrame
df.columns = df.columns.str.upper()
df.columns = df.columns.str.strip()
df['ESTADO'] = estado
if 'CLAVE' in df.columns:
df['CCT'] = df['CLAVE']
df.drop('CLAVE', 1, inplace=True)
if 'DELEGACIÓN' in df.columns:
df['MUNICIPIO'] = df['DELEGACIÓN']
df.drop('DELEGACIÓN', 1, inplace=True)
if 'DIRECCIÓN' in df.columns:
df['DIRECCION'] = df['DIRECCIÓN']
df.drop('DIRECCIÓN', 1, inplace=True)
if 'DIRECCION' not in df.columns:
df['DIRECCION'] = df['ESTADO']
if 'SOSTENIMIENTO' not in df.columns:
df['SOSTENIMIENTO'] = 'Público'
# Append to frames list
frames.append(df[['CCT','NIVEL','NOMBRE', 'SOSTENIMIENTO', 'ESTADO', 'MUNICIPIO', 'DIRECCION']])
except Exception as e:
print('file {}, line {}: {}'.format(filename, reader.line_num, e))
f.close()
# Join and export
fulldf = pd.concat(frames)
for colname in fulldf.columns:
fulldf[colname] = fulldf[colname].str.strip()
fulldf.to_csv('completo.csv', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment