Spark-sql demo on GCP
Created November 27, 2020 10:59
# log4j configuration: silence framework noise, show only errors
log4j.rootLogger=ERROR, console
# set the log level for these components
log4j.logger.com.test=DEBUG
log4j.logger.org=ERROR
log4j.logger.org.apache.spark=ERROR
log4j.logger.org.spark-project=ERROR
log4j.logger.org.apache.hadoop=ERROR
log4j.logger.io.netty=ERROR
log4j.logger.org.apache.zookeeper=ERROR
# add a ConsoleAppender that writes log output to the console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
# use a simple message format
log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
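This configuration is not picked up automatically. A minimal sketch of one way to apply it, assuming the file is saved locally as log4j.properties and the hive-cluster master node created in the steps below is already up (/etc/spark/conf is the default Spark configuration directory on Dataproc images):

# copy the local file to the master node, then overwrite the Spark conf there
gcloud compute scp log4j.properties hive-cluster-m:/tmp/log4j.properties
gcloud compute ssh hive-cluster-m \
    --command='sudo cp /tmp/log4j.properties /etc/spark/conf/log4j.properties'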
## Initialization
# set the active project
gcloud projects list
gcloud config set project <your-project-id>  # e.g. gcp-lab-278521
# pick a zone for the cluster, e.g. europe-west3-a
gcloud compute regions list
gcloud compute zones list
# set environment variables
export REGION=europe-west3
export ZONE=europe-west3-a
export PROJECT=$(gcloud info --format='value(config.project)')
export BUCKET_NAME=${PROJECT}-warehouse
# create a bucket in Google Cloud Storage
gsutil mb -l ${REGION} gs://${BUCKET_NAME}
gsutil ls
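# optional sanity check (a sketch): confirm the bucket exists and landed in the right region
gsutil ls -L -b gs://${BUCKET_NAME}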
# create a MySQL instance for the Hive Metastore
# this may take 3-5 minutes
gcloud sql instances create hive-metastore-mysql \
    --database-version="MYSQL_5_7" \
    --activation-policy=ALWAYS \
    --zone $ZONE
gcloud sql instances list
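# optionally wait until the instance is ready before creating the cluster;
# a minimal polling sketch (instance name matches the one created above):
until [ "$(gcloud sql instances describe hive-metastore-mysql --format='value(state)')" = "RUNNABLE" ]; do
  sleep 15
done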
## Create the Dataproc cluster
gcloud config set compute/zone $ZONE
# make sure the --metadata line below ends with the correct MySQL instance name (previous step)
# the cluster can be created with one node or several:
# pass either '--single-node' or '--num-workers=2'
gcloud dataproc clusters create hive-cluster \
    --region=$REGION \
    --scopes cloud-platform \
    --image-version 1.3 \
    --bucket=$BUCKET_NAME \
    --master-machine-type=n1-standard-1 \
    --num-workers=2 \
    --worker-machine-type=n1-standard-1 \
    --optional-components=PRESTO \
    --initialization-actions gs://goog-dataproc-initialization-actions-${REGION}/cloud-sql-proxy/cloud-sql-proxy.sh \
    --properties hive:hive.metastore.warehouse.dir=gs://${PROJECT}-warehouse/datasets \
    --metadata "hive-metastore-instance=${PROJECT}:${REGION}:hive-metastore-mysql"
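# optional check (a sketch): confirm the cluster is up; the cloud-sql-proxy
# init action wires Hive to the MySQL instance named in --metadata above
gcloud dataproc clusters list --region=$REGION
gcloud dataproc clusters describe hive-cluster --region=$REGION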
## Export the second dataset from BigQuery: chicago_taxi_trips
bq --location=us extract --destination_format=CSV \
    --field_delimiter=',' --print_header=false \
    "bigquery-public-data:chicago_taxi_trips.taxi_trips" \
    gs://${BUCKET_NAME}/chicago_taxi_trips/csv/shard-*.csv
# list the extracted files (72 GiB total)
gsutil ls -rl gs://${BUCKET_NAME}/chicago_taxi_trips/csv/
# delete most of the files to speed things up
gsutil ls gs://${BUCKET_NAME}/chicago_taxi_trips/csv/ | head -282 | xargs gsutil rm  # for single-node use head -281
gsutil ls -rl gs://${BUCKET_NAME}/chicago_taxi_trips/csv/  # 7.71 GiB
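# optional sketch: compare the source table size in BigQuery with what remains in the bucket
bq show --format=prettyjson "bigquery-public-data:chicago_taxi_trips.taxi_trips" | grep numBytes
gsutil du -s -h gs://${BUCKET_NAME}/chicago_taxi_trips/csv/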
# connect to the master node
gcloud compute ssh hive-cluster-m
# start a spark-sql session
spark-sql
-- note: replace ${BUCKET_NAME} below with the actual bucket name;
-- shell variables are not expanded inside the spark-sql session
CREATE EXTERNAL TABLE chicago_taxi_trips_csv(
    unique_key STRING,
    taxi_id STRING,
    trip_start_timestamp STRING,
    trip_end_timestamp STRING,
    trip_seconds INT,
    trip_miles FLOAT,
    pickup_census_tract INT,
    dropoff_census_tract INT,
    pickup_community_area INT,
    dropoff_community_area INT,
    fare FLOAT,
    tips FLOAT,
    tolls FLOAT,
    extras FLOAT,
    trip_total FLOAT,
    payment_type STRING,
    company STRING,
    pickup_latitude FLOAT,
    pickup_longitude FLOAT,
    pickup_location STRING,
    dropoff_latitude FLOAT,
    dropoff_longitude FLOAT,
    dropoff_location STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION 'gs://${BUCKET_NAME}/chicago_taxi_trips/csv/';
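-- optional sanity check (a sketch): the external table is only a schema over
-- the CSV files, so a small LIMIT query confirms they parse without a full scan
SELECT unique_key, trip_miles, fare FROM chicago_taxi_trips_csv LIMIT 5;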
CREATE EXTERNAL TABLE chicago_taxi_trips_parquet(
    unique_key STRING,
    taxi_id STRING,
    trip_start_timestamp STRING,
    trip_end_timestamp STRING,
    trip_seconds INT,
    trip_miles FLOAT,
    pickup_census_tract INT,
    dropoff_census_tract INT,
    pickup_community_area INT,
    dropoff_community_area INT,
    fare FLOAT,
    tips FLOAT,
    tolls FLOAT,
    extras FLOAT,
    trip_total FLOAT,
    payment_type STRING,
    company STRING,
    pickup_latitude FLOAT,
    pickup_longitude FLOAT,
    pickup_location STRING,
    dropoff_latitude FLOAT,
    dropoff_longitude FLOAT,
    dropoff_location STRING)
STORED AS PARQUET
LOCATION 'gs://${BUCKET_NAME}/chicago_taxi_trips/parquet/';
INSERT OVERWRITE TABLE chicago_taxi_trips_parquet
SELECT * FROM chicago_taxi_trips_csv;
ANALYZE TABLE chicago_taxi_trips_csv COMPUTE STATISTICS;
ANALYZE TABLE chicago_taxi_trips_parquet COMPUTE STATISTICS;
DESCRIBE FORMATTED chicago_taxi_trips_parquet;
DESCRIBE FORMATTED chicago_taxi_trips_csv;
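-- Spark records the ANALYZE results as spark.sql.statistics.* table properties;
-- a quick way to see them (a sketch):
SHOW TBLPROPERTIES chicago_taxi_trips_parquet;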
-- run some queries
select count(*) from chicago_taxi_trips_csv where trip_miles > 50;
select count(*) from chicago_taxi_trips_parquet where trip_miles > 50;
select distinct payment_type, company from chicago_taxi_trips_csv limit 10;
select distinct payment_type, company from chicago_taxi_trips_parquet limit 10;
select company, payment_type, avg(fare), count(tips), sum(trip_total)
from chicago_taxi_trips_csv where trip_miles > 10
group by 1, 2 order by 5 desc limit 10;
select company, payment_type, avg(fare), count(tips), sum(trip_total)
from chicago_taxi_trips_parquet where trip_miles > 10
group by 1, 2 order by 5 desc limit 10;
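-- to see where the Parquet speedup comes from, compare the physical plans (a sketch):
-- the Parquet scan reads only the referenced columns, the CSV scan parses whole rows
EXPLAIN SELECT count(*) FROM chicago_taxi_trips_csv WHERE trip_miles > 50;
EXPLAIN SELECT count(*) FROM chicago_taxi_trips_parquet WHERE trip_miles > 50;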
# leave the spark-sql session
Ctrl+C
# leave the ssh session
exit  # or Ctrl+D
## Look inside the Hive Metastore
gcloud sql connect hive-metastore-mysql --user=root  # no password set (just press ENTER)
SHOW DATABASES;
USE hive_metastore;
SHOW TABLES;
# bucket location recorded in the metastore
SELECT DB_LOCATION_URI FROM DBS;
# list of tables
SELECT TBL_NAME, TBL_TYPE FROM TBLS;
# columns of a table (COLUMNS_V2 joins to TBLS through SDS)
SELECT c.COLUMN_NAME, c.TYPE_NAME
FROM COLUMNS_V2 c
JOIN SDS s ON c.CD_ID = s.CD_ID
JOIN TBLS t ON s.SD_ID = t.SD_ID
WHERE t.TBL_NAME = 'chicago_taxi_trips_parquet';
# file format and location
SELECT s.INPUT_FORMAT, s.LOCATION
FROM SDS s JOIN TBLS t ON s.SD_ID = t.SD_ID;
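# one more handy query (a sketch against the same metastore schema):
# map each table to the database that owns it
SELECT d.NAME, t.TBL_NAME
FROM TBLS t JOIN DBS d ON t.DB_ID = d.DB_ID;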
# leave the mysql session
exit
## Free up resources: delete the instances and VMs
gcloud dataproc clusters delete hive-cluster --region=$REGION --quiet
gcloud sql instances delete hive-metastore-mysql --quiet
gsutil rm -r gs://${BUCKET_NAME}
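# optional final check (a sketch): make sure nothing billable is left behind
gcloud dataproc clusters list --region=$REGION
gcloud sql instances list
gsutil ls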