OpenData2BigQuery - Japan - Jukyo Jusho
.gitignore:
data/
.DS_Store
#!/bin/bash
PROJECT_ID=moritani-sandbox-opendata
BUCKET=moritani-sandbox-opendata-japan-sg
REGION=asia-southeast1
OPENDATA_NAME=Jukyo_Jusho
OPENDATA_COUNTRY=Japan
OPENDATA_NAME=$(echo $OPENDATA_NAME | tr '[:upper:]' '[:lower:]')
OPENDATA_COUNTRY=$(echo $OPENDATA_COUNTRY | tr '[:upper:]' '[:lower:]')
BQ_DATASET=japan_${OPENDATA_NAME}
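# note: PROJECT_ID, BUCKET and REGION above are specific to the author's environment;
# point them at your own project and bucket before running.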
# bq --project_id ${PROJECT_ID} ls ${BQ_DATASET} \
# || bq --project_id ${PROJECT_ID} mk -d --data_location=${REGION} ${BQ_DATASET}
# create dataset if not existing yet
bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
CREATE SCHEMA IF NOT EXISTS ${BQ_DATASET}
OPTIONS(
location="$REGION",
labels=[
("country","$OPENDATA_COUNTRY")
]
)
EOSQL
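# optional check: confirm the dataset was created with the expected location and labels, e.g.
#   bq --project_id ${PROJECT_ID} show --format=prettyjson ${BQ_DATASET}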
if [ ! -d ./data ]; then
mkdir -p ./data
fi
if [ ! -d ./data/processed ]; then
mkdir -p ./data/processed
fi
# generate a download URL for each code in the code list
# besides fetching the code list, this:
# * converts the file from Shift_JIS to UTF-8
# * extracts just the code column
# * zero-pads each code to five digits
# * builds the per-code jukyo jusho zip URL, skipping files already downloaded
function genDownloadList {
local codes=$(curl -s https://www.soumu.go.jp/main_content/000608358.csv | iconv -f SHIFT_JIS -t UTF-8 | cut -f 3 -d "," | grep -v tiiki)
for code in $codes; do
local code_padded=$(printf "%05d" $code)
if [ ! -f ./data/downloaded/saigai.gsi.go.jp/jusho/download/data/${code_padded}.zip ]; then
echo https://saigai.gsi.go.jp/jusho/download/data/${code_padded}.zip
fi
done
}
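# example usage via the "$@" dispatcher at the bottom (the script file name here is a placeholder):
#   bash ./opendata2bigquery.sh genDownloadList
# prints the zip URLs that are not yet present under ./data/downloaded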
# download in parallel with xargs
# not every URL will have data, since coverage across codes is partial
function download {
genDownloadList | xargs -n 1 -P 0 wget -q -c -x -r -P ./data/downloaded
}
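# a rough progress check after (re)running download:
#   find ./data/downloaded -name "*.zip" | wc -l
# re-running is safe: genDownloadList only emits URLs whose zip is not yet on disk, and wget -c resumes partial files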
# unzipping all downloaded files
function unzipAll {
local files=( $(find ./data/downloaded -name "*.zip") )
echo ${#files[@]} files
for file in "${files[@]}" ; do
#echo $file
local filename=${file##*/}
local filename_prefix=${filename%.*}
#echo $filename_prefix
if [ ! -d ./data/unzipped/$filename_prefix ]; then
unzip -o $file -d ./data/unzipped/
else
echo skipping $file already unzipped
fi
done
}
# unzipped content includes .shp and .csv files
# * the .csv alone is sufficient, so concatenate all files into a single file and prepend a header
# * upload the large .csv file to Cloud Storage
# note: only needed for the Datasets API, which cannot handle multi-file datasets and requires a header line
function preprocessAndUpload {
#local files=$(find ./data/unzipped/ -name "*.csv")
cat $(find ./data/unzipped/ -name "*.csv") > ./data/processed/all.csv
wc -l ./data/processed/all.csv
## the Datasets API expects "long", so rename the original "lon" column
echo city_code,jusho1,jusho2,jusho3,jcode1,jcode2,long,lat,seido > ./data/processed/all_with_header.csv
cat ./data/processed/all.csv >> ./data/processed/all_with_header.csv
gsutil cp -c ./data/processed/all_with_header.csv gs://${BUCKET}/${OPENDATA_NAME}/
}
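# sanity check after upload, e.g.:
#   gsutil ls -l gs://${BUCKET}/${OPENDATA_NAME}/all_with_header.csv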
# unzipped content includes .shp and .csv files
# * the .csv alone is sufficient, so concatenate all files into a single file
# * for efficiency, split the single file into multiple files (1M lines each)
# * upload the split files to Cloud Storage
function preprocessAndUpload2 {
#local files=$(find ./data/unzipped/ -name "*.csv")
cat $(find ./data/unzipped/ -name "*.csv") > ./data/processed/all.csv
wc -l ./data/processed/all.csv
if [ -d ./data/processed/split ]; then
rm -rf ./data/processed/split
fi
mkdir -p ./data/processed/split
pushd ./data/processed/split
split -d -l 1000000 ../all.csv ${OPENDATA_NAME}_
popd
for file in ./data/processed/split/* ; do
mv -- ${file} ${file}.csv
done
gsutil -m rm gs://${BUCKET}/${OPENDATA_NAME}/split/*
gsutil -m cp -z csv -c ./data/processed/split/* gs://${BUCKET}/${OPENDATA_NAME}/split
}
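# the split files let the later load step read in parallel; to verify the upload:
#   gsutil du -h gs://${BUCKET}/${OPENDATA_NAME}/split/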
function writeTableSchema {
cat <<EOF
[
{"name": "city_code", "type": "STRING", "mode": "REQUIRED"},
{"name": "jusho1", "type": "STRING", "mode": "NULLABLE"},
{"name": "jusho2", "type": "STRING", "mode": "NULLABLE"},
{"name": "jusho3", "type": "STRING", "mode": "NULLABLE"},
{"name": "jcode1", "type": "STRING", "mode": "NULLABLE"},
{"name": "jcode2", "type": "STRING", "mode": "NULLABLE"},
{"name": "lon", "type": "NUMERIC", "mode": "NULLABLE"},
{"name": "lat", "type": "NUMERIC", "mode": "NULLABLE"},
{"name": "seido", "type": "NUMERIC", "mode": "NULLABLE"}
]
EOF
}
# loads the single concatenated file into a BigQuery table.
# since this is a relatively large single file, the load can be a little slow.
function loadToBq {
local CSV_PATH=gs://${BUCKET}/${OPENDATA_NAME}/all_with_header.csv
writeTableSchema > ./data/schema.json
bq --project_id ${PROJECT_ID} load \
--source_format CSV \
--skip_leading_rows 1 \
${BQ_DATASET}.${OPENDATA_NAME}_all_raw ${CSV_PATH} ./data/schema.json
}
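# quick verification once the load finishes (a sketch):
#   bq --project_id ${PROJECT_ID} query --nouse_legacy_sql \
#     "SELECT COUNT(*) FROM ${BQ_DATASET}.${OPENDATA_NAME}_all_raw"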
# loads split files into a BigQuery table.
# since the load is parallelized, it should complete fairly quickly
function loadToBq2 {
local CSV_PATH=gs://${BUCKET}/${OPENDATA_NAME}/split/*
local TABLE_OUTPUT=${OPENDATA_NAME}_raw
#writeTableSchema
# bq --project_id ${PROJECT_ID} load --replace --source_format CSV \
# ${BQ_DATASET}.${TABLE_OUTPUT} "${CSV_PATH}" ./data/schema.json
# load table from csv stored on gcs
bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
load data OVERWRITE ${BQ_DATASET}.${TABLE_OUTPUT}
(
city_code STRING not null,
jusho1 STRING,
jusho2 STRING,
jusho3 STRING,
jcode1 STRING,
jcode2 STRING,
lon numeric,
lat numeric,
seido numeric
)
options(
description="電子国土基本図(地名情報)「住居表示住所」 https://www.gsi.go.jp/kihonjohochousa/jukyo_jusho.html",
labels=[
("country","$OPENDATA_COUNTRY")
]
)
FROM FILES (
format="CSV",
uris=[
"${CSV_PATH}"
]
)
EOSQL
# bq --project_id ${PROJECT_ID} update \
# --description "電子国土基本図(地名情報)「住居表示住所」
# source : https://www.gsi.go.jp/kihonjohochousa/jukyo_jusho.html
# usage : https://www.gsi.go.jp/LAW/2930-index.html" \
# ${BQ_DATASET}.${TABLE_OUTPUT}
}
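# verification (sketch): inspect the table created by LOAD DATA, including the description and labels set in OPTIONS
#   bq --project_id ${PROJECT_ID} show --format=prettyjson ${BQ_DATASET}.${OPENDATA_NAME}_raw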
function showSchema {
bq --project_id ${PROJECT_ID} show --schema ${BQ_DATASET}.${OPENDATA_NAME}_raw
}
function processRaw {
local TABLE_OUTPUT=${OPENDATA_NAME}_processed
local TABLE_RAW=${OPENDATA_NAME}_raw
bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
create or replace table ${BQ_DATASET}.${TABLE_OUTPUT}
partition by RANGE_BUCKET(ken_code, GENERATE_ARRAY(1, 48, 1)) -- upper bound is exclusive, so 48 keeps prefecture 47 in range
cluster by city_code, jusho1, jusho2, jusho3
OPTIONS (
description="enriched jukyo jusho data across japan",
labels=[
("country", "japan")
]
)
as (
SELECT
city_code,
codes.ken_code,
codes.ken_code_2,
codes.sityouson_code,
codes.sityouson_code_3,
codes.ken_name, codes.ken_name_yomigana,
codes.sityouson_name1, codes.sityouson_name1_yomigana,
# codes.sityouson_name2, codes.sityouson_name2_yomigana,
codes.sityouson_name3, codes.sityouson_name3_yomigana,
jusho1,
jusho2,
jusho3,
IF ( CONTAINS_SUBSTR(jusho3,"-") , (SPLIT(jusho3,"-") [ OFFSET (0)]), jusho3 ) AS jusho3_1,
IF ( CONTAINS_SUBSTR(jusho3,"-") and array_length(SPLIT(jusho3,"-"))>1, (SPLIT(jusho3,"-") [ OFFSET (1)]), NULL ) AS jusho3_2,
IF ( CONTAINS_SUBSTR(jusho3,"-") and array_length(SPLIT(jusho3,"-"))>2, (SPLIT(jusho3,"-") [ OFFSET (2)]), NULL ) AS jusho3_3,
lon as long,
lat,
seido,
ST_GEOGPOINT(lon,lat) AS geometry
FROM
${BQ_DATASET}.${TABLE_RAW} AS jukyo_jusho
LEFT JOIN
japan_chiiki_code.codes codes
ON
jukyo_jusho.city_code=codes.tiiki_code_5
)
EOSQL
}
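# example use of the geometry column (a sketch; the coordinates are arbitrary):
# addresses within roughly 500 m of Tokyo Station
#   SELECT jusho1, jusho2, jusho3, lat, long
#   FROM ${BQ_DATASET}.${OPENDATA_NAME}_processed
#   WHERE ST_DWITHIN(geometry, ST_GEOGPOINT(139.7671, 35.6812), 500)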
## process data only for Tokyo (city_code 13XXX)
function processTokyo {
bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
create or replace table ${BQ_DATASET}.${OPENDATA_NAME}_processed_tokyo
as (
select * from ${BQ_DATASET}.${OPENDATA_NAME}_processed
where ken_code=13
)
EOSQL
}
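# dispatch: the first command-line argument names the function to run, e.g.
#   bash ./opendata2bigquery.sh download
#   bash ./opendata2bigquery.sh loadToBq2
# (the script file name is a placeholder; use whatever this file is saved as)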
"$@"