@kzzzr
Created November 27, 2020 10:59
Spark-sql demo on GCP
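## log4j.properties (keeps Spark/Hadoop console logging at ERROR so spark-sql output stays readable)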
log4j.rootLogger=ERROR, console
# set the log level for these components
log4j.logger.com.test=DEBUG
log4j.logger.org=ERROR
log4j.logger.org.apache.spark=ERROR
log4j.logger.org.spark-project=ERROR
log4j.logger.org.apache.hadoop=ERROR
log4j.logger.io.netty=ERROR
log4j.logger.org.apache.zookeeper=ERROR
# add a ConsoleAppender to the logger stdout to write to the console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
# use a simple message format
log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
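# A minimal sketch (not in the original) of applying the log4j.properties above on the
# master node; it assumes the Dataproc image keeps the Spark conf dir at /etc/spark/conf
# and that the file was saved to the home directory. Adjust paths if your image differs.
sudo cp log4j.properties /etc/spark/conf/log4j.properties
# or point a single session at the file without touching the cluster-wide config:
spark-sql --driver-java-options "-Dlog4j.configuration=file:///home/$USER/log4j.properties"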
## Initialization
# set the active project
gcloud projects list
gcloud config set project your-project-id # e.g. gcp-lab-278521
# choose a region and zone for the cluster, e.g. europe-west3-a
gcloud compute regions list
gcloud compute zones list
# set environment variables
export REGION=europe-west3
export ZONE=europe-west3-a
export PROJECT=$(gcloud info --format='value(config.project)')
export BUCKET_NAME=${PROJECT}-warehouse
# create a bucket in Google Cloud Storage
gsutil mb -l ${REGION} gs://${BUCKET_NAME}
gsutil ls
# create a MySQL instance for the Hive Metastore
# (may take 3-5 minutes; the name must match the one referenced in --metadata below)
gcloud sql instances create hive-metastore-mysql \
--database-version="MYSQL_5_7" \
--activation-policy=ALWAYS \
--zone $ZONE
gcloud sql instances list
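# Optional check (not in the original): confirm the instance is up before creating the cluster.
gcloud sql instances describe hive-metastore-mysql --format='value(state)' # expect RUNNABLE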
## Create the Dataproc cluster
gcloud config set compute/zone $ZONE
# Make sure the --metadata line below ends with the correct MySQL instance name (from the previous step)
# The cluster can be created with one node or several:
# pass either '--single-node' or '--num-workers=2'
gcloud dataproc clusters create hive-cluster \
--region=$REGION \
--scopes cloud-platform \
--image-version 1.3 \
--bucket=$BUCKET_NAME \
--master-machine-type=n1-standard-1 \
--num-workers=2 \
--worker-machine-type=n1-standard-1 \
--optional-components=PRESTO \
--initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/cloud-sql-proxy/cloud-sql-proxy.sh \
--properties hive:hive.metastore.warehouse.dir=gs://${PROJECT}-warehouse/datasets \
--metadata "hive-metastore-instance=${PROJECT}:${REGION}:hive-metastore-mysql"
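# Optional checks (a sketch, not in the original): confirm the cluster is RUNNING and that
# Hive can reach the Cloud SQL metastore through the proxy set up by the init action.
gcloud dataproc clusters describe hive-cluster --region=$REGION --format='value(status.state)'
gcloud dataproc jobs submit hive --cluster=hive-cluster --region=$REGION --execute="SHOW DATABASES;"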
## Export the second dataset from BigQuery - chicago_taxi_trips
bq --location=us extract --destination_format=CSV \
--field_delimiter=',' --print_header=false \
"bigquery-public-data:chicago_taxi_trips.taxi_trips" \
gs://${BUCKET_NAME}/chicago_taxi_trips/csv/shard-*.csv
# list the exported files (72 GiB in total)
gsutil ls -rl gs://${BUCKET_NAME}/chicago_taxi_trips/csv/
# delete some of the shards to speed up the demo
gsutil ls gs://${BUCKET_NAME}/chicago_taxi_trips/csv/ | head -282 | xargs gsutil rm # for single-node use head -281
gsutil ls -rl gs://${BUCKET_NAME}/chicago_taxi_trips/csv/ # 7.71 GiB
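# A shorter way to see the remaining total size (standard gsutil, added here as a sketch):
gsutil du -sh gs://${BUCKET_NAME}/chicago_taxi_trips/csv/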
# connect to the master node
gcloud compute ssh hive-cluster-m
# start a spark-sql session
spark-sql
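# Note (not in the original): the shell variable ${BUCKET_NAME} is not expanded inside the
# spark-sql session, so substitute your literal bucket name in the LOCATION clauses below.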
CREATE EXTERNAL TABLE chicago_taxi_trips_csv(
unique_key STRING,
taxi_id STRING,
trip_start_timestamp STRING,
trip_end_timestamp STRING,
trip_seconds INT,
trip_miles FLOAT,
pickup_census_tract INT,
dropoff_census_tract INT,
pickup_community_area INT,
dropoff_community_area INT,
fare FLOAT,
tips FLOAT,
tolls FLOAT,
extras FLOAT,
trip_total FLOAT,
payment_type STRING,
company STRING,
pickup_latitude FLOAT,
pickup_longitude FLOAT,
pickup_location STRING,
dropoff_latitude FLOAT,
dropoff_longitude FLOAT,
dropoff_location STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
location 'gs://${BUCKET_NAME}/chicago_taxi_trips/csv/';
CREATE EXTERNAL TABLE chicago_taxi_trips_parquet(
unique_key STRING,
taxi_id STRING,
trip_start_timestamp STRING,
trip_end_timestamp STRING,
trip_seconds INT,
trip_miles FLOAT,
pickup_census_tract INT,
dropoff_census_tract INT,
pickup_community_area INT,
dropoff_community_area INT,
fare FLOAT,
tips FLOAT,
tolls FLOAT,
extras FLOAT,
trip_total FLOAT,
payment_type STRING,
company STRING,
pickup_latitude FLOAT,
pickup_longitude FLOAT,
pickup_location STRING,
dropoff_latitude FLOAT,
dropoff_longitude FLOAT,
dropoff_location STRING)
STORED AS PARQUET
location 'gs://${BUCKET_NAME}/chicago_taxi_trips/parquet/';
INSERT OVERWRITE TABLE chicago_taxi_trips_parquet
SELECT * FROM chicago_taxi_trips_csv;
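# (optional, not in the original) the conversion takes a while; from a second shell you can
# watch the Parquet output appear: gsutil ls -rl gs://${BUCKET_NAME}/chicago_taxi_trips/parquet/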
ANALYZE TABLE chicago_taxi_trips_csv COMPUTE STATISTICS;
ANALYZE TABLE chicago_taxi_trips_parquet COMPUTE STATISTICS;
DESCRIBE FORMATTED chicago_taxi_trips_parquet;
DESCRIBE FORMATTED chicago_taxi_trips_csv;
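# After ANALYZE, the computed statistics also show up among the table properties
# (a sketch, not part of the original demo):
SHOW TBLPROPERTIES chicago_taxi_trips_parquet;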
# run the queries
select count(*) from chicago_taxi_trips_csv where trip_miles > 50;
select count(*) from chicago_taxi_trips_parquet where trip_miles > 50;
select distinct payment_type, company from chicago_taxi_trips_csv limit 10;
select distinct payment_type, company from chicago_taxi_trips_parquet limit 10;
select company, payment_type, avg(fare), count(tips), sum(trip_total)
from chicago_taxi_trips_csv where trip_miles > 10
group by 1, 2 order by 5 desc limit 10 ;
select company, payment_type, avg(fare), count(tips), sum(trip_total)
from chicago_taxi_trips_parquet where trip_miles > 10
group by 1, 2 order by 5 desc limit 10 ;
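# To see how the two formats are read differently, EXPLAIN the same query against each table
# (a sketch, not part of the original demo):
EXPLAIN select count(*) from chicago_taxi_trips_csv where trip_miles > 50;
EXPLAIN select count(*) from chicago_taxi_trips_parquet where trip_miles > 50;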
# exit the spark-sql session
ctrl+c
# exit the ssh session
exit
## Look inside the Hive Metastore
gcloud sql connect hive-metastore-mysql --user=root # no password, just press ENTER
SHOW DATABASES;
USE hive_metastore;
SHOW TABLES;
# warehouse bucket location stored in the metastore
SELECT DB_LOCATION_URI FROM DBS;
# list of tables
SELECT TBL_NAME, TBL_TYPE FROM TBLS;
# columns of a table (COLUMNS_V2 joins to TBLS through SDS)
SELECT COLUMN_NAME, TYPE_NAME
FROM COLUMNS_V2 c, SDS s, TBLS t
WHERE c.CD_ID = s.CD_ID AND s.SD_ID = t.SD_ID AND t.TBL_NAME = 'chicago_taxi_trips_parquet';
# file format and location
SELECT INPUT_FORMAT, LOCATION
FROM SDS s, TBLS t
WHERE s.SD_ID = t.SD_ID ;
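# The statistics computed by ANALYZE above are stored as table parameters
# (a sketch, not part of the original demo):
SELECT t.TBL_NAME, p.PARAM_KEY, p.PARAM_VALUE
FROM TABLE_PARAMS p, TBLS t
WHERE p.TBL_ID = t.TBL_ID AND p.PARAM_KEY LIKE '%numRows%';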
# exit the mysql session
exit
## Free up resources: delete the instances and VMs
gcloud dataproc clusters delete hive-cluster --region=$REGION --quiet
gcloud sql instances delete hive-metastore-mysql --quiet
gsutil rm -r gs://${BUCKET_NAME}
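# Optional (not in the original): confirm nothing is left running or stored.
gcloud dataproc clusters list --region=$REGION
gcloud sql instances list
gsutil ls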