Last active
March 16, 2021 19:11
-
-
Save kzzzr/3447f5d7e316d6c05202d3d2894a5807 to your computer and use it in GitHub Desktop.
DE - File formats
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Initialization

# Select the GCP project to work in
gcloud projects list
gcloud config set project my-project   # TODO: replace with your project ID

# Pick a region/zone for the cluster, e.g. europe-west3-a
gcloud compute regions list
gcloud compute zones list

# Environment variables used throughout the script
export REGION=europe-west3
export ZONE=europe-west3-a
export PROJECT=$(gcloud info --format='value(config.project)')
export BUCKET_NAME="${PROJECT}-warehouse"
gcloud config set compute/zone "${ZONE}"

# Create a Google Cloud Storage bucket for the warehouse data
gsutil mb -l "${REGION}" "gs://${BUCKET_NAME}"
gsutil ls
## Create a Dataproc cluster with the Presto optional component enabled
gcloud dataproc clusters create presto-cluster \
  --project="${PROJECT}" \
  --region="${REGION}" \
  --master-machine-type=n1-standard-2 \
  --worker-machine-type=n1-standard-2 \
  --num-workers=2 \
  --scopes=cloud-platform \
  --optional-components=PRESTO \
  --image-version=1.5 \
  --enable-component-gateway
## Export the chicago_taxi_trips dataset from BigQuery to GCS as sharded CSV
bq extract \
  --location=us \
  --destination_format=CSV \
  --field_delimiter=',' \
  --print_header=false \
  "bigquery-public-data:chicago_taxi_trips.taxi_trips" \
  "gs://${BUCKET_NAME}/chicago_taxi_trips/csv/shard-*.csv"

# List the exported files (~72 GiB total)
gsutil ls -rl "gs://${BUCKET_NAME}/chicago_taxi_trips/csv/"

# Delete most of the shards to speed the lab up, keeping a small subset
gsutil ls "gs://${BUCKET_NAME}/chicago_taxi_trips/csv/" | head -283 | gsutil -m rm -I
gsutil ls -rl "gs://${BUCKET_NAME}/chicago_taxi_trips/csv/"  # ~2.49 GiB remain
# Create a Hive external table over the CSV shards (chicago_taxi_trips_csv)
gcloud dataproc jobs submit hive \
  --cluster presto-cluster \
  --region="${REGION}" \
  --execute "
  CREATE EXTERNAL TABLE chicago_taxi_trips_csv(
    unique_key STRING,
    taxi_id STRING,
    trip_start_timestamp STRING,
    trip_end_timestamp STRING,
    trip_seconds INT,
    trip_miles FLOAT,
    pickup_census_tract INT,
    dropoff_census_tract INT,
    pickup_community_area INT,
    dropoff_community_area INT,
    fare FLOAT,
    tips FLOAT,
    tolls FLOAT,
    extras FLOAT,
    trip_total FLOAT,
    payment_type STRING,
    company STRING,
    pickup_latitude FLOAT,
    pickup_longitude FLOAT,
    pickup_location STRING,
    dropoff_latitude FLOAT,
    dropoff_longitude FLOAT,
    dropoff_location STRING)
  ROW FORMAT DELIMITED
  FIELDS TERMINATED BY ','
  STORED AS TEXTFILE
  LOCATION 'gs://${BUCKET_NAME}/chicago_taxi_trips/csv/';"
# Verify the table was created and is queryable
gcloud dataproc jobs submit hive \
  --cluster presto-cluster \
  --region="${REGION}" \
  --execute "SELECT COUNT(*) FROM chicago_taxi_trips_csv;"
# Create an external table in AVRO format (same schema as the CSV table)
gcloud dataproc jobs submit hive \
  --cluster presto-cluster \
  --region="${REGION}" \
  --execute "
  CREATE EXTERNAL TABLE chicago_taxi_trips_avro(
    unique_key STRING,
    taxi_id STRING,
    trip_start_timestamp STRING,
    trip_end_timestamp STRING,
    trip_seconds INT,
    trip_miles FLOAT,
    pickup_census_tract INT,
    dropoff_census_tract INT,
    pickup_community_area INT,
    dropoff_community_area INT,
    fare FLOAT,
    tips FLOAT,
    tolls FLOAT,
    extras FLOAT,
    trip_total FLOAT,
    payment_type STRING,
    company STRING,
    pickup_latitude FLOAT,
    pickup_longitude FLOAT,
    pickup_location STRING,
    dropoff_latitude FLOAT,
    dropoff_longitude FLOAT,
    dropoff_location STRING)
  STORED AS AVRO
  LOCATION 'gs://${BUCKET_NAME}/chicago_taxi_trips/avro/'
  ;"
# Populate the AVRO table from the CSV table
gcloud dataproc jobs submit hive \
  --cluster presto-cluster \
  --region="${REGION}" \
  --execute "
  INSERT OVERWRITE TABLE chicago_taxi_trips_avro
  SELECT * FROM chicago_taxi_trips_csv;"
# Create an external table in Parquet format (same schema as the CSV table)
gcloud dataproc jobs submit hive \
  --cluster presto-cluster \
  --region="${REGION}" \
  --execute "
  CREATE EXTERNAL TABLE chicago_taxi_trips_parquet(
    unique_key STRING,
    taxi_id STRING,
    trip_start_timestamp STRING,
    trip_end_timestamp STRING,
    trip_seconds INT,
    trip_miles FLOAT,
    pickup_census_tract INT,
    dropoff_census_tract INT,
    pickup_community_area INT,
    dropoff_community_area INT,
    fare FLOAT,
    tips FLOAT,
    tolls FLOAT,
    extras FLOAT,
    trip_total FLOAT,
    payment_type STRING,
    company STRING,
    pickup_latitude FLOAT,
    pickup_longitude FLOAT,
    pickup_location STRING,
    dropoff_latitude FLOAT,
    dropoff_longitude FLOAT,
    dropoff_location STRING)
  STORED AS PARQUET
  LOCATION 'gs://${BUCKET_NAME}/chicago_taxi_trips/parquet/';"
# Populate the Parquet table from the CSV table (takes ~7 minutes)
gcloud dataproc jobs submit hive \
  --cluster presto-cluster \
  --region="${REGION}" \
  --execute "
  INSERT OVERWRITE TABLE chicago_taxi_trips_parquet
  SELECT * FROM chicago_taxi_trips_csv;"
# Create an external table in ORC format with ZLIB compression and
# bloom filters on payment_type/company (same schema as the CSV table)
gcloud dataproc jobs submit hive \
  --cluster presto-cluster \
  --region="${REGION}" \
  --execute "
  CREATE EXTERNAL TABLE chicago_taxi_trips_orc(
    unique_key STRING,
    taxi_id STRING,
    trip_start_timestamp STRING,
    trip_end_timestamp STRING,
    trip_seconds INT,
    trip_miles FLOAT,
    pickup_census_tract INT,
    dropoff_census_tract INT,
    pickup_community_area INT,
    dropoff_community_area INT,
    fare FLOAT,
    tips FLOAT,
    tolls FLOAT,
    extras FLOAT,
    trip_total FLOAT,
    payment_type STRING,
    company STRING,
    pickup_latitude FLOAT,
    pickup_longitude FLOAT,
    pickup_location STRING,
    dropoff_latitude FLOAT,
    dropoff_longitude FLOAT,
    dropoff_location STRING)
  STORED AS ORC
  LOCATION 'gs://${BUCKET_NAME}/chicago_taxi_trips/orc/'
  TBLPROPERTIES (\"orc.compress\"=\"ZLIB\",\"orc.bloom.filter.columns\"=\"payment_type,company\")
  ;"
# Populate the ORC table from the CSV table
gcloud dataproc jobs submit hive \
  --cluster presto-cluster \
  --region="${REGION}" \
  --execute "
  INSERT OVERWRITE TABLE chicago_taxi_trips_orc
  SELECT * FROM chicago_taxi_trips_csv;"
# Compare on-disk sizes of the four formats
gsutil du -ch "gs://${BUCKET_NAME}/chicago_taxi_trips/"*
gsutil du -sh "gs://${BUCKET_NAME}/chicago_taxi_trips/csv"
gsutil du -sh "gs://${BUCKET_NAME}/chicago_taxi_trips/avro"
gsutil du -sh "gs://${BUCKET_NAME}/chicago_taxi_trips/parquet"
gsutil du -sh "gs://${BUCKET_NAME}/chicago_taxi_trips/orc"
## Interactive mode via Hive CLI (beeline)
# SSH into the cluster master node.
# NOTE(review): the commands below run INSIDE the SSH session; local
# environment variables (REGION, BUCKET_NAME, ...) do not carry over —
# re-export BUCKET_NAME on the master node before using it.
gcloud compute ssh presto-cluster-m
beeline -u "jdbc:hive2://presto-cluster-m:10000"

# Run some statements
!tables
ANALYZE TABLE chicago_taxi_trips_csv COMPUTE STATISTICS ;
ANALYZE TABLE chicago_taxi_trips_avro COMPUTE STATISTICS ;
ANALYZE TABLE chicago_taxi_trips_parquet COMPUTE STATISTICS ;
ANALYZE TABLE chicago_taxi_trips_orc COMPUTE STATISTICS ;
DESCRIBE FORMATTED chicago_taxi_trips_parquet ;
DESCRIBE FORMATTED chicago_taxi_trips_orc ;

# Leave beeline
exit

# Dump ORC file metadata as JSON (was hardcoded to one specific project's
# bucket; use the same ${BUCKET_NAME} as the rest of the script)
hive --orcfiledump -j -p "gs://${BUCKET_NAME}/chicago_taxi_trips/orc" > orcfile.json
# на этот раз используем движок presto для sql-запросов | |
presto --catalog hive --schema default | |
show tables; | |
# запустим запросы. Эти же запросы можно запускать в Hive CLI (Beeline) | |
select count(*) from chicago_taxi_trips_csv where trip_miles > 50; | |
select count(*) from chicago_taxi_trips_avro where trip_miles > 50; | |
select count(*) from chicago_taxi_trips_parquet where trip_miles > 50; | |
select count(*) from chicago_taxi_trips_orc where trip_miles > 50; | |
select distinct payment_type, company from chicago_taxi_trips_csv limit 10; | |
select distinct payment_type, company from chicago_taxi_trips_avro limit 10; | |
select distinct payment_type, company from chicago_taxi_trips_parquet limit 10; | |
select distinct payment_type, company from chicago_taxi_trips_orc limit 10; | |
select company, payment_type, avg(fare), count(tips), sum(trip_total) | |
from chicago_taxi_trips_csv where trip_miles > 10 | |
group by 1, 2 order by 5 desc limit 10 ; | |
select company, payment_type, avg(fare), count(tips), sum(trip_total) | |
from chicago_taxi_trips_avro where trip_miles > 10 | |
group by 1, 2 order by 5 desc limit 10 ; | |
select company, payment_type, avg(fare), count(tips), sum(trip_total) | |
from chicago_taxi_trips_parquet where trip_miles > 10 | |
group by 1, 2 order by 5 desc limit 10 ; | |
select company, payment_type, avg(fare), count(tips), sum(trip_total) | |
from chicago_taxi_trips_orc where trip_miles > 10 | |
group by 1, 2 order by 5 desc limit 10 ; | |
# покинуть сессию presto | |
exit | |
## Free resources: delete the cluster and the bucket
gcloud dataproc clusters delete presto-cluster --region="${REGION}" --quiet
# ${BUCKET_NAME:?} aborts if the variable is unset/empty, guarding the rm -r
gsutil -m rm -r "gs://${BUCKET_NAME:?}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment