Creates time based Glue partitions given time range.
Keep in mind that you don't need data to add partitions. So, you can create partitions for a whole year and add the data to S3 later.
PERFIL_ELEITORADO,CONSULTA_CAND_2010,CONSULTA_CAND_2012,CONSULTA_CAND_2014,BEM_CANDIDATO,CONSULTA_LEGENDAS ,CONSULTA_VAGAS ,VOTACAO_CANDIDATO_MUN_ZONA_2012,VOTACAO_CANDIDATO_MUN_ZONA_2014,VOTACAO_PARTIDO_MUN_ZONA_2012,VOTACAO_PARTIDO_MUN_ZONA_2014,VOTO_SECAO ,DETALHE_VOTACAO_MUN_ZONA_2012,DETALHE_VOTACAO_MUN_ZONA_2014 | |
PERIODO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO,DATA_GERACAO | |
UF,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO,HORA_GERACAO | |
MUNICIPIO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO,ANO_ELEICAO | |
COD_MUNICIPIO_TSE,NUM_TURNO ,NUM_TURNO ,NUM_TURNO ,DESCRICAO_ELEICAO,NUM_TURNO,DESCRICAO_ELEICAO,NUM_TURNO,NUM_TURNO,NUM_TURNO,NUM_TURNO,NUM_TURNO,NUM_TURNO,NUM_TURNO | |
NR_ZONA,DESCRICAO_ELEI |
"AC","AL","AP","AM","BA","CE","DF","ES","GO","MA","MT","MS","MG","PA","PB","PR","PE","PI","RJ","RN","RS","RO","RR","SC","SP","SE","TO" |
import xmltodict | |
import pandas as pd | |
import requests | |
xml = request.get('url').text | |
df = pd.DataFrame(xmltodict.parse(xml)) | |
df.rename(columns=lambda x: x.replace('@', ''), inplace=True) | |
df.to_csv('data.csv') |
def split_data_frame_list(df, | |
target_column, | |
output_type=float): | |
''' | |
Accepts a column with multiple types and splits list variables to several rows. | |
df: dataframe to split | |
target_column: the column containing the values to split | |
output_type: type of all outputs |
def suffix(alist): | |
if not len(alist): | |
return [[]] | |
else: | |
return [alist] + suffix(alist[1:]) | |
def preffix(alist): | |
if not len(alist): |
DROP TABLE IF EXISTS main; | |
CREATE EXTERNAL TABLE main ( | |
endTimeMillis BIGINT, | |
startTimeMillis BIGINT, | |
endTime STRING, | |
startTime STRING, | |
jams array<struct< | |
uuid: STRING, | |
pubMillis: BIGINT, |
CREATE TABLE waze.polygons_geo | |
WITH ( | |
external_location = 's3://...', | |
format = 'Parquet') AS | |
WITH dataset AS ( | |
SELECT | |
polygons | |
FROM waze.polygons) | |
SELECT | |
pol.polygon, |
Creates time based Glue partitions given time range.
Keep in mind that you don't need data to add partitions. So, you can create partitions for a whole year and add the data to S3 later.
SELECT | |
'{"type":"LineString", "coordinates":' || | |
'[' || array_join(transform(line, loc -> '[' || CAST(loc.x AS VARCHAR) || ',' || CAST(loc.y AS VARCHAR) || ']'), ',') || ']}' | |
FROM test.test |
rm -r python | |
rm lambda_layer | |
mkdir python | |
printf "[install]\nprefix=" > ~/.pydistutils.cfg | |
pip3.7 install $1 -t python/ # insert any pip available module, repeat if necessary | |
printf "" > ~/.pydistutils.cfg | |
zip -r lambda_layer.zip ./python | |
aws s3 cp lambda_layer.zip s3://config-lambda/layers/$1/ | |
aws lambda publish-layer-version \ | |
--layer-name $1 \ |