Skip to content

Instantly share code, notes, and snippets.

@marcelometal
Last active April 15, 2018 15:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save marcelometal/5c5f77f6076a5fcc06783b1b7c1657d6 to your computer and use it in GitHub Desktop.
Save marcelometal/5c5f77f6076a5fcc06783b1b7c1657d6 to your computer and use it in GitHub Desktop.
Elasticsearch: candidatos políticos brasileiros
# -*- coding: utf-8 -*-
#
# Copyright (c) 2018, Marcelo Jorge Vieira <metal@alucinados.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Column names shared, in this order, by every layout below. The 2014
# layout inserts two extra columns between the leading and trailing runs,
# which is why the names are kept in two pieces.
_LEADING_COLUMNS = [
    'DATA_GERACAO',
    'HORA_GERACAO',
    'ANO_ELEICAO',
    'NUM_TURNO',
    'DESCRICAO_ELEICAO',
    'SIGLA_UF',
    'SIGLA_UE',
    'DESCRICAO_UE',
    'CODIGO_CARGO',
    'DESCRICAO_CARGO',
    'NOME_CANDIDATO',
    'SEQUENCIAL_CANDIDATO',
    'NUMERO_CANDIDATO',
    'CPF_CANDIDATO',
    'NOME_URNA_CANDIDATO',
    'COD_SITUACAO_CANDIDATURA',
    'DES_SITUACAO_CANDIDATURA',
    'NUMERO_PARTIDO',
    'SIGLA_PARTIDO',
    'NOME_PARTIDO',
    'CODIGO_LEGENDA',
    'SIGLA_LEGENDA',
    'COMPOSICAO_LEGENDA',
    'NOME_LEGENDA',
    'CODIGO_OCUPACAO',
    'DESCRICAO_OCUPACAO',
    'DATA_NASCIMENTO',
    'NUM_TITULO_ELEITORAL_CANDIDATO',
    'IDADE_DATA_ELEICAO',
    'CODIGO_SEXO',
    'DESCRICAO_SEXO',
    'COD_GRAU_INSTRUCAO',
    'DESCRICAO_GRAU_INSTRUCAO',
    'CODIGO_ESTADO_CIVIL',
    'DESCRICAO_ESTADO_CIVIL',
]
_TRAILING_COLUMNS = [
    'CODIGO_NACIONALIDADE',
    'DESCRICAO_NACIONALIDADE',
    'SIGLA_UF_NASCIMENTO',
    'CODIGO_MUNICIPIO_NASCIMENTO',
    'NOME_MUNICIPIO_NASCIMENTO',
    'DESPESA_MAX_CAMPANHA',
    'COD_SIT_TOT_TURNO',
    'DESC_SIT_TOT_TURNO',
]

# Layout used for the 2000-2010 files.
fields2000 = _LEADING_COLUMNS + _TRAILING_COLUMNS

# 2012 layout: the 2000 layout followed by one extra column.
fields2012 = fields2000 + ['NM_EMAIL']

# 2014/2016 layout: two extra columns in the middle, plus the trailing
# column introduced in 2012.
fields2014 = (
    _LEADING_COLUMNS
    + ['CODIGO_COR_RACA', 'DESCRICAO_COR_RACA']
    + _TRAILING_COLUMNS
    + ['NM_EMAIL']
)

# Election year (as a string) -> ordered column names of that year's files.
# Years that share a layout share the very same list object.
csv_headers = {}
csv_headers.update(
    dict.fromkeys(('2000', '2002', '2004', '2006', '2008', '2010'), fields2000)
)
csv_headers['2012'] = fields2012
csv_headers.update(dict.fromkeys(('2014', '2016'), fields2014))
# -*- coding: utf-8 -*-
#
# Copyright (c) 2018, Marcelo Jorge Vieira <metal@alucinados.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import csv
import glob
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from pandas import read_csv
from pandas.errors import EmptyDataError
from fields import csv_headers
# Shared Elasticsearch client, pointed at a node on localhost:9200.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
# Index that search() queries, create() creates and bulk inserts write to.
INDEX_NAME = 'politicos'
# Document type attached to every bulk-indexed row.
DOC_TYPE = 'people'
# Root directory under which all_candidates() looks for one
# consulta_cand_<year> sub-directory per election.
FILES_DIR = '/home/metal/Downloads/consulta_cand'
# Number of actions candidates() collects before sending a bulk request.
OBJECT_LIST_MAXIMUM_COUNTER = 1000
def search(prop, value):
    """Run a ``match`` query against the index and print the raw response.

    Args:
        prop: Name of the document field to match on.
        value: Text to match in that field.
    """
    query = {'query': {'match': {prop: value}}}
    print(es.search(index=INDEX_NAME, body=query))
def create():
    """Create the index and switch off its write / read-only blocks.

    ``ignore=400`` asks the client not to raise on an HTTP 400 reply from
    the create call (presumably the "index already exists" case), so the
    settings update below runs either way.
    """
    unblocked = {
        'index.blocks.write': False,
        'index.blocks.read_only_allow_delete': False
    }
    es.indices.create(index=INDEX_NAME, ignore=400)
    es.indices.put_settings(index=INDEX_NAME, body=unblocked)
def csv2dict(filename, encoding='ISO-8859-1'):
    """Load a CSV file that has a header row into a list of dictionaries.

    Args:
        filename: Path of the CSV file; its first row supplies the keys.
        encoding: Character encoding of the file. Defaults to ISO-8859-1,
            the encoding this function previously hard-coded.

    Returns:
        A list with one dict per data row, keyed by the header names in
        the order they appear. Values are decoded text; a field missing
        from a short row is kept as ``None`` instead of raising. An empty
        file yields an empty list.
    """
    import io
    import sys

    if sys.version_info[0] >= 3:
        # Python 3's csv module consumes text: decode while reading and
        # leave newline handling to the csv module.
        csvfile = io.open(filename, encoding=encoding, newline='')
    else:
        # Python 2's csv module consumes byte strings and expects the
        # file to be opened in binary mode; values are decoded below.
        csvfile = open(filename, 'rb')

    def to_text(value):
        # Only byte strings need decoding. Already-decoded text and the
        # None that DictReader uses for missing fields pass through.
        if isinstance(value, bytes):
            return value.decode(encoding)
        return value

    with csvfile:
        reader = csv.DictReader(csvfile)
        # fieldnames is None for an empty file.
        names = reader.fieldnames or []
        return [
            {name: to_text(row[name]) for name in names}
            for row in reader
        ]
def all_elections():
    """Index the candidate files of every election year in ``csv_headers``."""
    for election_year in csv_headers:
        all_candidates(str(election_year))
def all_candidates(year):
    """Index every ``*.txt`` file found in one election's directory.

    Args:
        year: Election year as a string; it selects the
            ``consulta_cand_<year>`` directory under ``FILES_DIR`` and is
            forwarded to ``candidates`` for each file.
    """
    pattern = '{0}/*.txt'.format(
        '{0}/consulta_cand_{1}'.format(FILES_DIR, year)
    )
    for path in glob.glob(pattern):
        candidates(path, year)
def insert_candidates(es, actions):
    """Bulk-index a batch, retrying once after clearing the index blocks.

    The result of ``helpers.bulk`` is printed. If the first attempt raises,
    the index's write / read-only blocks are switched off and the same
    batch is sent once more; a failure of that second attempt propagates.

    Args:
        es: Elasticsearch client used for the bulk request and for the
            settings update.
        actions: Bulk actions to send. Must be re-iterable (e.g. a list):
            it is iterated a second time on retry, so a one-shot
            generator would be empty by then.

    Raises:
        Exception: whatever the settings update or the second bulk
            attempt raises.
    """
    def send():
        print(helpers.bulk(es, actions, index=INDEX_NAME, doc_type=DOC_TYPE))

    try:
        send()
    except Exception:
        # Deliberately not a bare ``except:``: KeyboardInterrupt and
        # SystemExit must stop the import instead of triggering a retry.
        # NOTE(review): the whole batch is resent; if part of it was
        # accepted before the failure and the actions carry no '_id',
        # those documents may end up indexed twice - confirm.
        es.indices.put_settings(
            index=INDEX_NAME,
            body={
                'index.blocks.write': False,
                'index.blocks.read_only_allow_delete': False
            }
        )
        send()
def candidates(filename, year):
    """Index every row of one ';'-delimited election file.

    The file is read with pandas, its columns are renamed to the layout
    registered for ``year`` in ``csv_headers``, and the result is written
    next to the input as ``<filename>_header.csv``. That file is read back
    through ``csv2dict`` and its rows are sent to ``insert_candidates`` in
    batches of at most ``OBJECT_LIST_MAXIMUM_COUNTER`` actions.

    Args:
        filename: Path of the election file to index.
        year: Election year as a string; must be a key of ``csv_headers``.

    Raises:
        ValueError: if ``year`` has no registered column layout, or (from
            pandas) if the layout's length does not match the file.
    """
    try:
        # dtype=str keeps every field as the text found in the file.
        # Without it pandas infers numeric types, so identifier columns
        # could lose leading zeros or be rewritten with a '.0' suffix when
        # the frame is serialised again below.
        # NOTE(review): read_csv consumes the first line as a header by
        # default and those names are replaced right after. If the source
        # files have no header line, the first record of each file is
        # being dropped - confirm, and pass header=None if so.
        df = read_csv(filename, delimiter=';', dtype=str)
    except EmptyDataError:
        print('{} is empty'.format(filename))
        return
    print(filename)

    columns = csv_headers.get(year)
    if columns is None:
        # Fail with a clear message instead of the opaque error pandas
        # raises when columns is set to None.
        raise ValueError(
            'no column layout registered for election year {!r}'.format(year)
        )
    df.columns = columns

    header_file = '{}_header.csv'.format(filename)
    df.to_csv(header_file, index=False)
    rows = csv2dict(header_file)

    # Send the rows in fixed-size slices; an empty file sends nothing.
    for start in range(0, len(rows), OBJECT_LIST_MAXIMUM_COUNTER):
        batch = [
            {
                '_op_type': 'index',
                '_index': INDEX_NAME,
                '_type': DOC_TYPE,
                '_source': row,
            }
            for row in rows[start:start + OBJECT_LIST_MAXIMUM_COUNTER]
        ]
        insert_candidates(es, batch)
def main():
    """Script entry point; currently only runs a sample search.

    The loading steps are not executed here. Going by the other functions
    in this file, a full run would be:

    1. ``create()`` to create the index and clear its write blocks;
    2. ``all_elections()`` to index every year, or
       ``all_candidates('2016')`` for a single year, or
       ``candidates(filename, '2016')`` for a single file such as
       ``<FILES_DIR>/consulta_cand_2016/consulta_cand_2016_RJ.txt``.
    """
    search(prop='DESCRICAO_CARGO', value='Vereador')


if __name__ == '__main__':
    main()
elasticsearch>=6.0.0,<7.0.0
pandas>=0.22.0,<0.23.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment