João Carabetta (JoaoCarabetta) — GitHub Gists
# Download all AWS Lambda functions in parallel
# Assumes you have run `aws configure` and set output mode to "text"
# Works with "aws-cli/1.16.72 Python/3.6.7 Linux/4.15.0-42-generic botocore/1.12.62"
download_code () {
    local OUTPUT=$1
    # In text output mode, the presigned code URL is in field 2 of the first line
    aws lambda get-function --function-name "$OUTPUT" | head -n 1 | cut -f 2 | xargs wget -O "./lambda_functions/$OUTPUT.zip"
}
mkdir -p lambda_functions
for run in $(aws lambda list-functions | cut -f 6 | xargs); do
    download_code "$run" &   # background each download so they run in parallel
done
wait
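The same bulk download can be sketched in Python with a thread pool. This is a hypothetical sketch: the two callables stand in for boto3's `list_functions` and for fetching the presigned `Code.Location` URL returned by `get_function`.

```python
import concurrent.futures
import os


def download_all(list_functions, fetch_code, out_dir="lambda_functions", workers=8):
    """Download every Lambda function's code package in parallel.

    list_functions: callable returning a list of function names
    fetch_code: callable(name) -> bytes of the deployment package
    """
    os.makedirs(out_dir, exist_ok=True)
    names = list_functions()
    paths = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        # pool.map preserves input order, so names and payloads stay aligned
        for name, code in zip(names, pool.map(fetch_code, names)):
            path = os.path.join(out_dir, f"{name}.zip")
            with open(path, "wb") as f:
                f.write(code)
            paths.append(path)
    return paths
```

With boto3, `list_functions` would wrap `client.list_functions()` and `fetch_code` would GET the presigned URL from `client.get_function(FunctionName=...)`.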
@JoaoCarabetta
JoaoCarabetta / parlamentares_na_tramitacao.sql
Last active May 6, 2019 14:07
Identifies the legislators mentioned in each bill's processing records (tramitações)
CREATE TEMP FUNCTION accent2latin(word STRING) AS
((
WITH lookups AS (
SELECT
'ã,ç,æ,œ,á,é,í,ó,ú,à,è,ì,ò,ù,ä,ë,ï,ö,ü,ÿ,â,ê,î,ô,û,å,ø,Ø,Å,Á,À,Â,Ä,È,É,Ê,Ë,Í,Î,Ï,Ì,Ò,Ó,Ô,Ö,Ú,Ù,Û,Ü,Ÿ,Ç,Æ,Œ,ñ' AS accents,
'a,c,ae,oe,a,e,i,o,u,a,e,i,o,u,a,e,i,o,u,y,a,e,i,o,u,a,o,O,A,A,A,A,A,E,E,E,E,I,I,I,I,O,O,O,O,U,U,U,U,Y,C,AE,OE,n' AS latins
),
pairs AS (
SELECT accent, latin FROM lookups,
UNNEST(SPLIT(accents)) AS accent WITH OFFSET AS p1,
UNNEST(SPLIT(latins)) AS latin WITH OFFSET AS p2
WHERE p1 = p2
)
-- Rebuild the word character by character, swapping accented letters
SELECT STRING_AGG(IFNULL(latin, c), '')
FROM UNNEST(SPLIT(word, '')) AS c
LEFT JOIN pairs ON accent = c
));
@JoaoCarabetta
JoaoCarabetta / Accent_to_ascii_bigquery.sql
Created April 30, 2019 22:09
Remove all accents from a string in BigQuery
CREATE TEMP FUNCTION accent2latin(word STRING) AS
((
WITH lookups AS (
SELECT
'ã,ç,æ,œ,á,é,í,ó,ú,à,è,ì,ò,ù,ä,ë,ï,ö,ü,ÿ,â,ê,î,ô,û,å,ø,Ø,Å,Á,À,Â,Ä,È,É,Ê,Ë,Í,Î,Ï,Ì,Ò,Ó,Ô,Ö,Ú,Ù,Û,Ü,Ÿ,Ç,Æ,Œ,ñ' AS accents,
'a,c,ae,oe,a,e,i,o,u,a,e,i,o,u,a,e,i,o,u,y,a,e,i,o,u,a,o,O,A,A,A,A,A,E,E,E,E,I,I,I,I,O,O,O,O,U,U,U,U,Y,C,AE,OE,n' AS latins
),
pairs AS (
SELECT accent, latin FROM lookups,
UNNEST(SPLIT(accents)) AS accent WITH OFFSET AS p1,
UNNEST(SPLIT(latins)) AS latin WITH OFFSET AS p2
WHERE p1 = p2
)
-- Rebuild the word character by character, swapping accented letters
SELECT STRING_AGG(IFNULL(latin, c), '')
FROM UNNEST(SPLIT(word, '')) AS c
LEFT JOIN pairs ON accent = c
));
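Outside BigQuery, the same transliteration can be approximated in Python with the standard library's `unicodedata`. A sketch: NFKD decomposition strips combining marks, though unlike the lookup table above it silently drops characters with no ASCII decomposition (e.g. æ, œ, ø).

```python
import unicodedata


def accent2latin(word: str) -> str:
    # Decompose accented characters into base letter + combining marks,
    # then drop anything that doesn't survive ASCII encoding.
    decomposed = unicodedata.normalize("NFKD", word)
    return decomposed.encode("ascii", "ignore").decode("ascii")
```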
@JoaoCarabetta
JoaoCarabetta / bulk_delete_dynamo_tables.py
Created March 21, 2019 16:53
Bulk delete AWS dynamodb tables
import boto3

table_filter = 'temp-cap'

dynamo = boto3.client('dynamodb')

# Note: list_tables returns at most 100 names per call
tables = dynamo.list_tables()['TableNames']
tables = [t for t in tables if table_filter in t]

for t in tables:
    try:
        dynamo.delete_table(TableName=t)
    except dynamo.exceptions.ResourceInUseException:
        # Table is still being created or updated; skip it
        continue
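Since `list_tables` returns at most 100 names per call, a full sweep of a large account needs pagination via the `LastEvaluatedTableName` cursor. A sketch (the stub client in the usage example stands in for the boto3 client):

```python
def all_table_names(dynamo):
    """Yield every DynamoDB table name, following the pagination cursor."""
    kwargs = {}
    while True:
        resp = dynamo.list_tables(**kwargs)
        yield from resp["TableNames"]
        cursor = resp.get("LastEvaluatedTableName")
        if cursor is None:
            break  # no more pages
        kwargs = {"ExclusiveStartTableName": cursor}
```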
@JoaoCarabetta
JoaoCarabetta / create_table_athena_dump.sql
Created March 4, 2019 15:02
Create table from Athena CSV dump
-- Delete the *.csv.metadata files first:
-- aws s3 rm s3://... --recursive --exclude '*.csv'
CREATE EXTERNAL TABLE `osm_pems_ids`(
  `osm_id` bigint COMMENT '',
  `sensor_id` string COMMENT '',
  `neigh_level` bigint COMMENT '')
ROW FORMAT SERDE
  'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
  'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
LOCATION
  's3://...'
@JoaoCarabetta
JoaoCarabetta / update-function-code-aws-lambda.sh
Last active February 25, 2019 14:00
Update a Python function's code on AWS Lambda
# update-function-code-aws-lambda.sh <function path> <lambda-function-name>
# Make sure the aws-cli is configured for the same region, with a user that has
# permission to update the function
rm -f temp.zip
echo ------ Zipping --------
cp "$1" lambda_function.py   # Lambda's default Python handler file name
zip temp.zip lambda_function.py
rm lambda_function.py
echo ------ Updating Lambda ------
aws lambda update-function-code --function-name "$2" --zip-file "fileb://temp.zip"
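The zip-and-update step can also be done from Python, building the archive in memory so no temp files are left behind. A sketch; boto3's `update_function_code` accepts the raw zip bytes directly.

```python
import io
import zipfile


def build_lambda_zip(source_path: str, arcname: str = "lambda_function.py") -> bytes:
    """Zip a single source file under the name Lambda expects, in memory."""
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        zf.write(source_path, arcname=arcname)
    return buf.getvalue()


# With boto3 (not run here):
# boto3.client("lambda").update_function_code(
#     FunctionName=function_name, ZipFile=build_lambda_zip(path))
```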
@JoaoCarabetta
JoaoCarabetta / create_lambda_layer.sh
Last active May 14, 2019 14:28
Create a Lambda Layer from any pip-installable Python package
rm -rf python
rm -f lambda_layer.zip
mkdir python
# Clear pip's install prefix so packages land directly in ./python
printf "[install]\nprefix=" > ~/.pydistutils.cfg
pip3.7 install $1 -t python/   # any pip-installable package; repeat if necessary
printf "" > ~/.pydistutils.cfg
zip -r lambda_layer.zip ./python
aws s3 cp lambda_layer.zip s3://config-lambda/layers/$1/
aws lambda publish-layer-version \
    --layer-name $1 \
    --content S3Bucket=config-lambda,S3Key=layers/$1/lambda_layer.zip \
    --compatible-runtimes python3.7
@JoaoCarabetta
JoaoCarabetta / linestring_to_geojson.sql
Created February 4, 2019 18:48
Waze linestring to geojson in Athena
SELECT
'{"type":"LineString", "coordinates":' ||
'[' || array_join(transform(line, loc -> '[' || CAST(loc.x AS VARCHAR) || ',' || CAST(loc.y AS VARCHAR) || ']'), ',') || ']}'
FROM test.test
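The equivalent construction in Python makes the target shape explicit (a sketch; `line` mirrors Athena's array of `(x, y)` structs):

```python
import json


def linestring_to_geojson(line):
    """Build a GeoJSON LineString string from a sequence of (x, y) pairs."""
    return json.dumps(
        {"type": "LineString", "coordinates": [[x, y] for x, y in line]}
    )
```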
@JoaoCarabetta
JoaoCarabetta / README.md
Last active February 3, 2022 16:05
Add Time Based Glue Partitions with Lambda AWS

Creates time-based Glue partitions for a given time range.

Keep in mind that you don't need the data to exist before adding partitions: you can register partitions for a whole year and load the data into S3 later.
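One way to register the partitions ahead of the data is to generate the date range in Python and pass each value to Glue's `create_partition`. A sketch: the database, table name, partition key, and S3 layout below are assumptions for illustration.

```python
import datetime


def daily_partition_values(start: datetime.date, end: datetime.date):
    """Return one ISO-formatted partition value per day in [start, end]."""
    days = (end - start).days + 1
    return [(start + datetime.timedelta(d)).isoformat() for d in range(days)]


# With boto3 (not run here), each value becomes one Glue partition:
# glue = boto3.client("glue")
# for day in daily_partition_values(start, end):
#     glue.create_partition(
#         DatabaseName="waze",            # assumption
#         TableName="polygons_geo",       # assumption
#         PartitionInput={
#             "Values": [day],
#             "StorageDescriptor": {"Location": f"s3://.../date={day}/"},
#         },
#     )
```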

CREATE TABLE waze.polygons_geo
WITH (
external_location = 's3://...',
format = 'Parquet') AS
WITH dataset AS (
SELECT
polygons
FROM waze.polygons)
SELECT
pol.polygon,