A script to download jukyo jusho (residential address) data and load it into BigQuery.
Data source: 電子国土基本図(地名情報)「住居表示住所」
It also uses a CSV file from this site containing the officially published Chiiki Code (標準地域コード) list (last updated 2019-03-25).
data/ — working directory for downloaded and processed files (.DS_Store entries can be ignored).
#!/bin/bash
# Download jukyo jusho (住居表示住所) data and load it into BigQuery.
# Usage: ./<script> <function-name> [args...]  (dispatched via "$@" at the bottom)

# GCP project / bucket / location configuration.
PROJECT_ID=moritani-sandbox-opendata
BUCKET=moritani-sandbox-opendata-japan-sg
REGION=asia-southeast1
OPENDATA_NAME=Jukyo_Jusho
OPENDATA_COUNTRY=Japan
# Normalize to lowercase. Quoted here-strings avoid the unquoted `echo $var`
# word-splitting hazard and a useless echo subprocess.
OPENDATA_NAME=$(tr '[:upper:]' '[:lower:]' <<<"$OPENDATA_NAME")
OPENDATA_COUNTRY=$(tr '[:upper:]' '[:lower:]' <<<"$OPENDATA_COUNTRY")
BQ_DATASET=japan_${OPENDATA_NAME}
# bq --project_id ${PROJECT_ID} ls ${BQ_DATASET} \
# || bq --project_id ${PROJECT_ID} mk -d --data_location=${REGION} ${BQ_DATASET}
# create dataset if not existing yet
# Runs at startup on every invocation; CREATE SCHEMA IF NOT EXISTS is idempotent.
# NOTE(review): --replace has no effect on `bq query` without --destination_table —
# presumably a leftover flag; confirm before removing.
bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
CREATE SCHEMA IF NOT EXISTS ${BQ_DATASET}
OPTIONS(
location="$REGION",
labels=[
("country","$OPENDATA_COUNTRY")
]
)
EOSQL
# Ensure the working directories exist. mkdir -p is idempotent and creates
# missing parents, so the original existence checks (and the separate mkdir
# for ./data) were redundant.
mkdir -p ./data/processed
# Generate a list of per-code download URLs for each code in the official
# Chiiki Code list. In addition to downloading the code list this:
# * converts the file from Shift JIS to UTF-8
# * extracts just the code column (column 3)
# * zero-pads the code to five digits
# * builds the per-code jukyo jusho URL
# Codes whose zip file has already been downloaded are skipped.
function genDownloadList {
  local code code_padded
  # NOTE(review): cut -d "," assumes no quoted commas in the first three
  # columns — holds for this source file; confirm if the format changes.
  # NOTE(review): if the CSV has CRLF line endings the trailing \r reaches
  # printf, same as in the original implementation — confirm upstream format.
  curl -s https://www.soumu.go.jp/main_content/000608358.csv \
    | iconv -f SHIFT_JIS -t UTF-8 \
    | cut -f 3 -d "," \
    | grep -v tiiki \
    | while read -r code; do
        code_padded=$(printf "%05d" "$code")
        if [ ! -f "./data/downloaded/saigai.gsi.go.jp/jusho/download/data/${code_padded}.zip" ]; then
          echo "https://saigai.gsi.go.jp/jusho/download/data/${code_padded}.zip"
        fi
      done
}
# parallel downloading with xargs
# not all urls will have data as the coverage across codes is partial
# -n 1: one URL per wget invocation; -P 0: as many parallel processes as possible.
# wget -x (-r) recreates the host/path directory tree under ./data/downloaded,
# which is the same layout genDownloadList checks when skipping finished files.
function download {
  genDownloadList | xargs -n 1 -P 0 wget -q -c -x -r -P ./data/downloaded
}
# Unzip every downloaded archive into ./data/unzipped/, skipping archives
# whose output directory already exists.
function unzipAll {
  local files file filename filename_prefix
  # BUG FIX: the original stored the find output in a scalar, so
  # ${#files[@]} always reported "1 files"; use a real array.
  mapfile -t files < <(find ./data/downloaded -name "*.zip")
  echo "${#files[@]}" files
  for file in "${files[@]}"; do
    filename=${file##*/}          # strip directories
    filename_prefix=${filename%.*} # strip .zip extension
    if [ ! -d "./data/unzipped/${filename_prefix}" ]; then
      unzip -o "$file" -d ./data/unzipped/
    else
      echo skipping "$file" already unzipped
    fi
  done
}
# Unzipped content includes a .shp file and a .csv file per code.
# * the .csv is sufficient, so concatenate all files into one and add a header
# * upload the large .csv file to Cloud Storage
# Note: only needed for the Datasets API, which handles neither multi-file
# datasets nor a missing header line.
function preprocessAndUpload {
  # NUL-delimited + LC_ALL=C sort: safe for any filename and deterministic
  # concatenation order (the original unquoted $(find) word-split and had
  # filesystem-dependent ordering).
  find ./data/unzipped/ -name "*.csv" -print0 \
    | LC_ALL=C sort -z \
    | xargs -0 cat > ./data/processed/all.csv
  wc -l ./data/processed/all.csv
  ## Datasets API requires "long", so the header says "long" while the raw data column is "lon"
  echo city_code,jusho1,jusho2,jusho3,jcode1,jcode2,long,lat,seido > ./data/processed/all_with_header.csv
  cat ./data/processed/all.csv >> ./data/processed/all_with_header.csv
  gsutil cp -c ./data/processed/all_with_header.csv "gs://${BUCKET}/${OPENDATA_NAME}/"
}
# Unzipped content includes a .shp file and a .csv file per code.
# * the .csv is sufficient, so concatenate all files into a single file
# * for efficiency, split the single file into multiple files (1M lines each)
# * upload the split files to Cloud Storage
function preprocessAndUpload2 {
  # NUL-delimited + sorted: filename-safe and deterministic order (the
  # original unquoted $(find) word-split on spaces).
  find ./data/unzipped/ -name "*.csv" -print0 \
    | LC_ALL=C sort -z \
    | xargs -0 cat > ./data/processed/all.csv
  wc -l ./data/processed/all.csv
  # Start from a clean split directory (rm -rf -- is a no-op when absent).
  rm -rf -- ./data/processed/split
  mkdir -p ./data/processed/split
  pushd ./data/processed/split > /dev/null
  split -d -l 1000000 ../all.csv "${OPENDATA_NAME}_"
  popd > /dev/null
  local file
  for file in ./data/processed/split/* ; do
    mv -- "${file}" "${file}.csv"
  done
  # Quote the gs:// wildcard so the local shell never expands it; gsutil does
  # its own wildcard matching. Removal is best-effort (nothing to delete on
  # the first run), hence || true.
  gsutil -m rm "gs://${BUCKET}/${OPENDATA_NAME}/split/*" || true
  gsutil -m cp -z csv -c ./data/processed/split/* "gs://${BUCKET}/${OPENDATA_NAME}/split"
}
# Print the BigQuery JSON schema for the raw jukyo jusho CSV columns to stdout.
# Note: the column here is "lon" (as in the source data); the Datasets-API
# header produced by preprocessAndUpload uses "long" instead.
function writeTableSchema {
  local -a schema_lines=(
    '['
    '{"name": "city_code", "type": "STRING", "mode": "REQUIRED"},'
    '{"name": "jusho1", "type": "STRING", "mode": "NULLABLE"},'
    '{"name": "jusho2", "type": "STRING", "mode": "NULLABLE"},'
    '{"name": "jusho3", "type": "STRING", "mode": "NULLABLE"},'
    '{"name": "jcode1", "type": "STRING", "mode": "NULLABLE"},'
    '{"name": "jcode2", "type": "STRING", "mode": "NULLABLE"},'
    '{"name": "lon", "type": "NUMERIC", "mode": "NULLABLE"},'
    '{"name": "lat", "type": "NUMERIC", "mode": "NULLABLE"},'
    '{"name": "seido", "type": "NUMERIC", "mode": "NULLABLE"}'
    ']'
  )
  printf '%s\n' "${schema_lines[@]}"
}
# Loads the single concatenated all_with_header.csv from GCS into a BigQuery
# table (NOT the split files — those are handled by loadToBq2).
# Since this is a relatively large single file, it can be a little slow.
function loadToBq {
  local CSV_PATH=gs://${BUCKET}/${OPENDATA_NAME}/all_with_header.csv
  # Schema JSON is written locally and passed to bq load.
  writeTableSchema > ./data/schema.json
  # --skip_leading_rows 1: drop the header line added by preprocessAndUpload.
  # NOTE(review): no --replace flag — rerunning appends to the table; confirm
  # whether that is intended.
  bq --project_id ${PROJECT_ID} load \
    --source_format CSV \
    --skip_leading_rows 1 \
    ${BQ_DATASET}.${OPENDATA_NAME}_all_raw ${CSV_PATH} ./data/schema.json
}
# Loads the split CSV files from GCS into a BigQuery table via a LOAD DATA
# statement. Since the load is parallelized over the split files, it should
# complete fairly quickly.
function loadToBq2 {
  # Quote the assignment: arguments to the `local` builtin undergo pathname
  # expansion, so an unquoted trailing /* could match local paths.
  local CSV_PATH="gs://${BUCKET}/${OPENDATA_NAME}/split/*"
  local TABLE_OUTPUT="${OPENDATA_NAME}_raw"
  #writeTableSchema
  # bq --project_id ${PROJECT_ID} load --replace --source_format CSV \
  # ${BQ_DATASET}.${TABLE_OUTPUT} "${CSV_PATH}" ./data/schema.json
  # load table from csv stored on gcs
  bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
load data OVERWRITE ${BQ_DATASET}.${TABLE_OUTPUT}
(
city_code STRING not null,
jusho1 STRING,
jusho2 STRING,
jusho3 STRING,
jcode1 STRING,
jcode2 STRING,
lon numeric,
lat numeric,
seido numeric
)
options(
description="電子国土基本図(地名情報)「住居表示住所」 https://www.gsi.go.jp/kihonjohochousa/jukyo_jusho.html",
labels=[
("country","$OPENDATA_COUNTRY")
]
)
FROM FILES (
format="CSV",
uris=[
"${CSV_PATH}"
]
)
EOSQL
  # bq --project_id ${PROJECT_ID} update \
  # --description "電子国土基本図(地名情報)「住居表示住所」
  # source : https://www.gsi.go.jp/kihonjohochousa/jukyo_jusho.html
  # usage : https://www.gsi.go.jp/LAW/2930-index.html" \
  # ${BQ_DATASET}.${TABLE_OUTPUT}
}
# Show the schema of a table in the dataset. Defaults to the raw table created
# by loadToBq2; pass an optional table name to inspect a different table.
# BUG FIX: previously hardcoded "saigai_all_raw", a table no function in this
# script creates (apparently a stale name from an earlier version).
function showSchema {
  local table=${1:-${OPENDATA_NAME}_raw}
  bq --project_id ${PROJECT_ID} show --schema "${BQ_DATASET}.${table}"
}
# Enrich the raw table with prefecture/municipality names and codes joined
# from the japan_chiiki_code.codes lookup (標準地域コード), split jusho3 on
# "-" into up to three components, and add a GEOGRAPHY point column.
# The output is partitioned by prefecture code and clustered by the address
# columns for cheaper per-area queries.
function processRaw {
  local TABLE_OUTPUT=${OPENDATA_NAME}_processed
  local TABLE_RAW=${OPENDATA_NAME}_raw
  # NOTE(review): RANGE_BUCKET with GENERATE_ARRAY(1, 47, 1) places ken_code 47
  # (Okinawa) into the overflow partition — confirm whether GENERATE_ARRAY(1, 48, 1)
  # was intended.
  bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
create or replace table ${BQ_DATASET}.${TABLE_OUTPUT}
partition by RANGE_BUCKET(ken_code, GENERATE_ARRAY(1, 47, 1))
cluster by city_code, jusho1, jusho2, jusho3
OPTIONS (
description="enriched jukyo jusho data across japan",
labels=[
("country", "japan")
]
)
as (
SELECT
city_code,
codes.ken_code,
codes.ken_code_2,
codes.sityouson_code,
codes.sityouson_code_3,
codes.ken_name, codes.ken_name_yomigana,
codes.sityouson_name1, codes.sityouson_name1_yomigana,
# codes.sityouson_name2, codes.sityouson_name2_yomigana,
codes.sityouson_name3, codes.sityouson_name3_yomigana,
jusho1,
jusho2,
jusho3,
IF ( CONTAINS_SUBSTR(jusho3,"-") , (SPLIT(jusho3,"-") [ OFFSET (0)]), jusho3 ) AS jusho3_1,
IF ( CONTAINS_SUBSTR(jusho3,"-") and array_length(SPLIT(jusho3,"-"))>1, (SPLIT(jusho3,"-") [ OFFSET (1)]), NULL ) AS jusho3_2,
IF ( CONTAINS_SUBSTR(jusho3,"-") and array_length(SPLIT(jusho3,"-"))>2, (SPLIT(jusho3,"-") [ OFFSET (2)]), NULL ) AS jusho3_3,
lon as long,
lat,
seido,
ST_GEOGPOINT(lon,lat) AS geometry
FROM
${BQ_DATASET}.${TABLE_RAW} AS jukyo_jusho
LEFT JOIN
japan_chiiki_code.codes codes
ON
jukyo_jusho.city_code=codes.tiiki_code_5
)
EOSQL
}
## Materialize a Tokyo-only subset (prefecture code 13) of the processed table.
function processTokyo {
  local src="${BQ_DATASET}.${OPENDATA_NAME}_processed"
  local dst="${BQ_DATASET}.${OPENDATA_NAME}_processed_tokyo"
  bq --project_id ${PROJECT_ID} query --nouse_legacy_sql --replace <<EOSQL
create or replace table ${dst}
as (
select * from ${src}
where ken_code=13
)
EOSQL
}
"$@" |