|
#!/bin/bash |
|
|
|
# Indonesia - Subnational Administrative Boundaries |
|
# https://data.humdata.org/dataset/indonesia-administrative-boundary-polygons-lines-and-places-levels-0-4b |
|
|
|
## config |
|
PROJECT_ID=$1 |
|
REGION=$2 |
|
BQ_DATASET=$3 |
|
GCS_BUCKET=$4 |
|
|
|
## static |
|
SITEURL=https://data.humdata.org/dataset/indonesia-administrative-boundary-polygons-lines-and-places-levels-0-4b |
|
DATAURL=https://data.humdata.org/dataset/84a1d98a-790b-4d66-9d14-bbfa48500802/resource/a00a644b-d852-4f6e-aecf-fbadd61d23ba/download/idn_adm_bps_adm4_20200401_shp.zip |
|
|
|
## derived params |
|
|
|
#remove prefix by pattern |
|
DATAFILE=${DATAURL##*/} |
|
|
|
#remove suffix by pattern |
|
DATANAME=${DATAFILE%_shp.zip} |
|
|
|
GCS_PATH=gs://${GCS_BUCKET}/${SITEURL##https://} |
|
GCS_PATH_CSV=${GCS_PATH}/${DATANAME}_shp_csv |
|
GCS_PATH_UNZIPPED=${GCS_PATH}/${DATANAME}_shp_upzipped |
|
BQ_TABLE=${DATANAME} |
|
|
|
ZIPFILE=${GCS_PATH}/${DATANAME}.zip |
|
DIR_UNZIPPED=${DATANAME}_shp_unzipped |
|
DIR_CSV=${DATANAME}_shp_csv |
|
|
|
|
|
function download_and_unzip_shapefile { |
|
if [ ! -d $DIR_UNZIPPED ] |
|
then |
|
mkdir $DIR_UNZIPPED |
|
fi |
|
wget -c $DATAURL |
|
unzip -q -o ${DATAFILE} -d ${DIR_UNZIPPED} |
|
gsutil cp -n -c ${DATAFILE} ${GCS_PATH} |
|
gsutil -m cp -n -c ${DIR_UNZIPPED}/* ${GCS_PATH_UNZIPPED}/ |
|
} |
|
|
|
function shp2csv { |
|
if [ ! -d $DIR_CSV ] |
|
then |
|
mkdir $DIR_CSV |
|
fi |
|
for shp in ${DIR_UNZIPPED}/*.shp ; do |
|
echo input : $shp |
|
SHP_PATH=$shp |
|
SHP_FILE=$(basename -- "$SHP_PATH") |
|
LAYER="${SHP_FILE%%.*}" |
|
ogr2ogr -f csv -makevalid -dialect sqlite \ |
|
-sql "select Shape_Leng,Shape_Area,ADM4_EN,ADM4_PCODE,ADM4_REF,ADM4ALT1EN,ADM4ALT2EN,ADM3_EN,ADM3_PCODE,ADM2_EN,ADM2_PCODE,ADM1_EN,ADM1_PCODE,ADM0_EN,ADM0_PCODE,date,validOn,validTo,AsGeoJSON(geometry) as geom from $LAYER" \ |
|
${DIR_CSV}/${LAYER}.csv ${SHP_PATH} |
|
done |
|
} |
|
|
|
|
|
function upload_csv { |
|
gsutil -m cp ${DIR_CSV}/*.csv ${GCS_PATH_CSV}/ |
|
} |
|
|
|
function prepare_gcp { |
|
#create GCS bucket |
|
gsutil mb -l $REGION $GCS_BUCKET |
|
|
|
#create BQ Dataset (temp) |
|
bq --project_id $PROJECT_ID --location=$REGION \ |
|
mk \ |
|
--description "Indonesia administrative level 4 (village - desa, kelurahan, kampung, nagari, pekon or gampong) boundary shapefiles for administrative level 1 features ID1, ID2, ID3, ID5, ID6, ID7, ID8 and ID9. (https://creativecommons.org/licenses/by/3.0/igo/) " \ |
|
--label=owner:`whoami` \ |
|
--label=country:indonesia \ |
|
--label=license:cc-by-igo \ |
|
${BQ_DATASET} |
|
echo "check console : https://console.cloud.google.com/bigquery?project=${PROJECT_ID}&p=${PROJECT_ID}&d=${DATASET}&page=dataset" |
|
|
|
#create BQ Dataset |
|
bq --project_id $PROJECT_ID --location=$REGION \ |
|
mk \ |
|
--description "temporary dataset. expires in 1 hour" \ |
|
--default_table_expiration 3600 \ |
|
${BQ_DATASET}_temp |
|
echo "check console : https://console.cloud.google.com/bigquery?project=${PROJECT_ID}&p=${PROJECT_ID}&d=${DATASET}_temp&page=dataset" |
|
} |
|
|
|
function bq_create_table_temp { |
|
bq --project_id $PROJECT_ID --location=$REGION \ |
|
load \ |
|
--autodetect \ |
|
--replace \ |
|
--skip_leading_rows 1 \ |
|
${BQ_DATASET}_temp.${BQ_TABLE} \ |
|
${GCS_PATH_CSV}/*.csv "Shape_Leng:FLOAT64,Shape_Area:FLOAT64,ADM4_EN,ADM4_PCODE,ADM4_REF,ADM4ALT1EN,ADM4ALT2EN,ADM3_EN,ADM3_PCODE,ADM2_EN,ADM2_PCODE,ADM1_EN,ADM1_PCODE,ADM0_EN,ADM0_PCODE,date,validOn,validTo,geom" |
|
} |
|
|
|
|
|
function bq_create_table { |
|
cat <<EOF | bq --project_id $PROJECT_ID --location=$REGION query --nouse_legacy_sql |
|
create or replace table ${BQ_DATASET}.${BQ_TABLE} |
|
CLUSTER BY geom |
|
as ( |
|
select * replace ( |
|
ST_GeogFromGeoJson(geom,make_valid => TRUE) AS geom, |
|
PARSE_DATE('%Y/%m/%d',\`date\`) as \`date\`, |
|
PARSE_DATE('%Y/%m/%d',validOn) as validOn, |
|
PARSE_DATE('%Y/%m/%d',validTo) as validTo |
|
) |
|
from ${BQ_DATASET}_temp.${BQ_TABLE} |
|
); |
|
ALTER TABLE ${BQ_DATASET}.${BQ_TABLE} |
|
SET OPTIONS ( |
|
labels=[("owner", "moritani"), ("country", "indonesia"), ("license", "cc-by-igo")] |
|
) |
|
EOF |
|
} |
|
|
|
function bq_create_table_external { |
|
cat <<EOF | bq --project_id $PROJECT_ID --location=$REGION query --nouse_legacy_sql |
|
create or replace external table ${BQ_DATASET}_temp.${BQ_TABLE}_ext |
|
( |
|
Shape_Leng FLOAT64, |
|
Shape_Area FLOAT64, |
|
ADM4_EN STRING, |
|
ADM4_PCODE STRING, |
|
ADM4_REF STRING, |
|
ADM4ALT1EN STRING, |
|
ADM4ALT2EN STRING, |
|
ADM3_EN STRING, |
|
ADM3_PCODE STRING, |
|
ADM2_EN STRING, |
|
ADM2_PCODE STRING, |
|
ADM1_EN STRING, |
|
ADM1_PCODE STRING, |
|
ADM0_EN STRING, |
|
ADM0_PCODE STRING, |
|
\`date\` STRING, |
|
validOn STRING, |
|
validTo STRING, |
|
geom STRING |
|
) |
|
OPTIONS ( |
|
format = 'CSV', |
|
uris = ['${GCS_PATH_CSV}/*.csv'], |
|
skip_leading_rows = 1 |
|
); |
|
#this does not work for external tables |
|
#ALTER TABLE ${BQ_DATASET}_temp.${BQ_TABLE}_ext |
|
#SET OPTIONS ( |
|
# labels=[("owner", "moritani"), ("country", "indonesia"), ("license", "cc-by-igo")] |
|
#) |
|
EOF |
|
} |
|
|
|
|
|
function bq_drop_tables { |
|
cat <<EOF | bq --project_id $PROJECT_ID --location=$REGION query --nouse_legacy_sql |
|
DROP TABLE ${BQ_DATASET}.${BQ_TABLE}; |
|
DROP TABLE ${BQ_DATASET}_temp.${BQ_TABLE}; |
|
DROP TABLE ${BQ_DATASET}.${BQ_TABLE}_ext; |
|
EOF |
|
} |
|
|
|
function main { |
|
prepare_gcp |
|
download_and_unzip_shapefile |
|
shp2csv |
|
upload_csv |
|
bq_create_table_temp |
|
bq_create_table |
|
} |
|
|
|
|
|
function display_usage () { |
|
echo "-------------------------------------------------------------------------------------------------" |
|
echo "This script will download and ingest Indonesia admin boundaries to BigQuery" |
|
echo "- Make sure Google Cloud SDK (gcloud) is installed and authenticated with intended Google Account" |
|
echo "- GCP services that will be used : BigQuery, Cloud Storage" |
|
echo "- Other prerequisites : wget, unzip" |
|
echo "Usage : $0 [PROJECT_ID] [REGION] [BQ_DATASET] [GCS_BUCKET]" |
|
echo " ex) $0 xxxx-opendata asia-southeast1 indonesia-geospatial xxxx-opendata-indonesia" |
|
echo " note : BigQuery Dataset IDs must be alphanumeric (plus underscores) and must be at most 1024 characters long." |
|
echo "-------------------------------------------------------------------------------------------------" |
|
} |
|
if [ $# -lt 4 ] |
|
then |
|
display_usage |
|
exit 1 |
|
else |
|
main |
|
fi |
|
|
|
if [[ ( $# == "--help") || $# == "-h" ]] |
|
then |
|
display_usage |
|
exit 0 |
|
fi |
|
|
|
|
|
|
|
#### appendix |
|
function download_xlsx { |
|
wget -c https://data.humdata.org/dataset/84a1d98a-790b-4d66-9d14-bbfa48500802/resource/7c9b543c-db67-45cf-b106-c48745e71af1/download/idn_admin4boundaries_tabulardata.xlsx |
|
wget -c https://data.humdata.org/dataset/84a1d98a-790b-4d66-9d14-bbfa48500802/resource/9f9a4843-2491-4aa3-b50f-945ea3a8d879/download/idn_admin123boundaries_tabulardata.xlsx |
|
gsutil cp idn_admin4boundaries_tabulardata.xlsx ${GCS_PATH} |
|
gsutil cp idn_admin123boundaries_tabulardata.xlsx ${GCS_PATH} |
|
} |