# DE - File formats
## Initialization
# set the active project
gcloud projects list
gcloud config set project my-project # replace my-project with your project ID
# pick a zone for the cluster, e.g. europe-west3-a
gcloud compute regions list
gcloud compute zones list
# set environment variables
export REGION=europe-west3
export ZONE=europe-west3-a
export PROJECT=$(gcloud info --format='value(config.project)')
export BUCKET_NAME=${PROJECT}-warehouse
gcloud config set compute/zone $ZONE
# create a bucket in Google Cloud Storage
gsutil mb -l ${REGION} gs://${BUCKET_NAME}
gsutil ls
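# optional sanity check (a sketch): confirm the bucket was created in the intended region
gsutil ls -Lb gs://${BUCKET_NAME} | grep -i location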
## Create a Dataproc cluster
gcloud dataproc clusters create presto-cluster \
--project=${PROJECT} \
--region=${REGION} \
--master-machine-type=n1-standard-2 \
--worker-machine-type=n1-standard-2 \
--num-workers=2 \
--scopes=cloud-platform \
--optional-components=PRESTO \
--image-version=1.5 \
--enable-component-gateway
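# optional (a sketch): confirm the cluster reached RUNNING before submitting jobs
gcloud dataproc clusters describe presto-cluster --region=${REGION} --format='value(status.state)'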
## Export the chicago_taxi_trips dataset from BigQuery
bq extract \
--location=us \
--destination_format=CSV \
--field_delimiter=',' \
--print_header=false \
"bigquery-public-data:chicago_taxi_trips.taxi_trips" \
gs://${BUCKET_NAME}/chicago_taxi_trips/csv/shard-*.csv
# list the exported files (about 72 GiB in total)
gsutil ls -rl gs://${BUCKET_NAME}/chicago_taxi_trips/csv/
# delete most of the shards to speed things up
gsutil ls gs://${BUCKET_NAME}/chicago_taxi_trips/csv/ | head -n 283 | gsutil -m rm -I
gsutil ls -rl gs://${BUCKET_NAME}/chicago_taxi_trips/csv/ # about 2.49 GiB remain
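# optional (a sketch): count the remaining shards to confirm the cleanup worked
gsutil ls gs://${BUCKET_NAME}/chicago_taxi_trips/csv/ | wc -l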
# create a Hive external table over the CSV files (chicago_taxi_trips_csv)
gcloud dataproc jobs submit hive \
--cluster presto-cluster \
--region=${REGION} \
--execute "
CREATE EXTERNAL TABLE chicago_taxi_trips_csv(
unique_key STRING,
taxi_id STRING,
trip_start_timestamp STRING,
trip_end_timestamp STRING,
trip_seconds INT,
trip_miles FLOAT,
pickup_census_tract INT,
dropoff_census_tract INT,
pickup_community_area INT,
dropoff_community_area INT,
fare FLOAT,
tips FLOAT,
tolls FLOAT,
extras FLOAT,
trip_total FLOAT,
payment_type STRING,
company STRING,
pickup_latitude FLOAT,
pickup_longitude FLOAT,
pickup_location STRING,
dropoff_latitude FLOAT,
dropoff_longitude FLOAT,
dropoff_location STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE
location 'gs://${BUCKET_NAME}/chicago_taxi_trips/csv/';"
# verify the table exists and is queryable
gcloud dataproc jobs submit hive \
--cluster presto-cluster \
--region=${REGION} \
--execute "SELECT COUNT(*) FROM chicago_taxi_trips_csv;"
# create a table in Avro format
gcloud dataproc jobs submit hive \
--cluster presto-cluster \
--region=${REGION} \
--execute "
CREATE EXTERNAL TABLE chicago_taxi_trips_avro(
unique_key STRING,
taxi_id STRING,
trip_start_timestamp STRING,
trip_end_timestamp STRING,
trip_seconds INT,
trip_miles FLOAT,
pickup_census_tract INT,
dropoff_census_tract INT,
pickup_community_area INT,
dropoff_community_area INT,
fare FLOAT,
tips FLOAT,
tolls FLOAT,
extras FLOAT,
trip_total FLOAT,
payment_type STRING,
company STRING,
pickup_latitude FLOAT,
pickup_longitude FLOAT,
pickup_location STRING,
dropoff_latitude FLOAT,
dropoff_longitude FLOAT,
dropoff_location STRING)
STORED AS AVRO
location 'gs://${BUCKET_NAME}/chicago_taxi_trips/avro/'
;"
gcloud dataproc jobs submit hive \
--cluster presto-cluster \
--region=${REGION} \
--execute "
INSERT OVERWRITE TABLE chicago_taxi_trips_avro
SELECT * FROM chicago_taxi_trips_csv;"
# create a table in Parquet format
gcloud dataproc jobs submit hive \
--cluster presto-cluster \
--region=${REGION} \
--execute "
CREATE EXTERNAL TABLE chicago_taxi_trips_parquet(
unique_key STRING,
taxi_id STRING,
trip_start_timestamp STRING,
trip_end_timestamp STRING,
trip_seconds INT,
trip_miles FLOAT,
pickup_census_tract INT,
dropoff_census_tract INT,
pickup_community_area INT,
dropoff_community_area INT,
fare FLOAT,
tips FLOAT,
tolls FLOAT,
extras FLOAT,
trip_total FLOAT,
payment_type STRING,
company STRING,
pickup_latitude FLOAT,
pickup_longitude FLOAT,
pickup_location STRING,
dropoff_latitude FLOAT,
dropoff_longitude FLOAT,
dropoff_location STRING)
STORED AS PARQUET
location 'gs://${BUCKET_NAME}/chicago_taxi_trips/parquet/';"
# load data into the Parquet table (takes roughly 7 minutes)
gcloud dataproc jobs submit hive \
--cluster presto-cluster \
--region=${REGION} \
--execute "
INSERT OVERWRITE TABLE chicago_taxi_trips_parquet
SELECT * FROM chicago_taxi_trips_csv;"
# create a table in ORC format
gcloud dataproc jobs submit hive \
--cluster presto-cluster \
--region=${REGION} \
--execute "
CREATE EXTERNAL TABLE chicago_taxi_trips_orc(
unique_key STRING,
taxi_id STRING,
trip_start_timestamp STRING,
trip_end_timestamp STRING,
trip_seconds INT,
trip_miles FLOAT,
pickup_census_tract INT,
dropoff_census_tract INT,
pickup_community_area INT,
dropoff_community_area INT,
fare FLOAT,
tips FLOAT,
tolls FLOAT,
extras FLOAT,
trip_total FLOAT,
payment_type STRING,
company STRING,
pickup_latitude FLOAT,
pickup_longitude FLOAT,
pickup_location STRING,
dropoff_latitude FLOAT,
dropoff_longitude FLOAT,
dropoff_location STRING)
STORED AS ORC
location 'gs://${BUCKET_NAME}/chicago_taxi_trips/orc/'
tblproperties (\"orc.compress\"=\"ZLIB\",\"orc.bloom.filter.columns\"=\"payment_type,company\")
;"
gcloud dataproc jobs submit hive \
--cluster presto-cluster \
--region=${REGION} \
--execute "
INSERT OVERWRITE TABLE chicago_taxi_trips_orc
SELECT * FROM chicago_taxi_trips_csv;"
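# optional (a sketch): row counts should match across all four formats after the loads
for fmt in csv avro parquet orc; do
gcloud dataproc jobs submit hive \
--cluster presto-cluster \
--region=${REGION} \
--execute "SELECT '${fmt}' AS fmt, COUNT(*) FROM chicago_taxi_trips_${fmt};"
done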
# measure total file size per format
gsutil du -ch gs://${BUCKET_NAME}/chicago_taxi_trips/*
gsutil du -sh gs://${BUCKET_NAME}/chicago_taxi_trips/csv
gsutil du -sh gs://${BUCKET_NAME}/chicago_taxi_trips/avro
gsutil du -sh gs://${BUCKET_NAME}/chicago_taxi_trips/parquet
gsutil du -sh gs://${BUCKET_NAME}/chicago_taxi_trips/orc
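# the same per-format comparison as a loop (a sketch)
for fmt in csv avro parquet orc; do gsutil du -sh gs://${BUCKET_NAME}/chicago_taxi_trips/${fmt}; done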
## Interactive mode in the Hive CLI (Beeline)
# connect to the master node
gcloud compute ssh presto-cluster-m
beeline -u "jdbc:hive2://presto-cluster-m:10000"
# run some queries
!tables
ANALYZE TABLE chicago_taxi_trips_csv COMPUTE STATISTICS;
ANALYZE TABLE chicago_taxi_trips_avro COMPUTE STATISTICS;
ANALYZE TABLE chicago_taxi_trips_parquet COMPUTE STATISTICS;
ANALYZE TABLE chicago_taxi_trips_orc COMPUTE STATISTICS;
DESCRIBE FORMATTED chicago_taxi_trips_parquet;
DESCRIBE FORMATTED chicago_taxi_trips_orc;
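# optional (a sketch): confirm the ORC compression and bloom filter settings from the DDL took effect
SHOW TBLPROPERTIES chicago_taxi_trips_orc;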
# exit beeline
!quit
# dump ORC metadata as JSON with orcfiledump (export BUCKET_NAME in this SSH session first)
hive --orcfiledump -j -p gs://${BUCKET_NAME}/chicago_taxi_trips/orc > orcfile.json
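# optional (a sketch, assuming jq is installed on the master node): skim the dump's top-level keys
jq 'keys' orcfile.json | head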
# this time use the Presto engine for the SQL queries
presto --catalog hive --schema default
show tables;
# run the queries below; the same queries can also be run in the Hive CLI (Beeline)
select count(*) from chicago_taxi_trips_csv where trip_miles > 50;
select count(*) from chicago_taxi_trips_avro where trip_miles > 50;
select count(*) from chicago_taxi_trips_parquet where trip_miles > 50;
select count(*) from chicago_taxi_trips_orc where trip_miles > 50;
select distinct payment_type, company from chicago_taxi_trips_csv limit 10;
select distinct payment_type, company from chicago_taxi_trips_avro limit 10;
select distinct payment_type, company from chicago_taxi_trips_parquet limit 10;
select distinct payment_type, company from chicago_taxi_trips_orc limit 10;
select company, payment_type, avg(fare), count(tips), sum(trip_total)
from chicago_taxi_trips_csv where trip_miles > 10
group by 1, 2 order by 5 desc limit 10 ;
select company, payment_type, avg(fare), count(tips), sum(trip_total)
from chicago_taxi_trips_avro where trip_miles > 10
group by 1, 2 order by 5 desc limit 10 ;
select company, payment_type, avg(fare), count(tips), sum(trip_total)
from chicago_taxi_trips_parquet where trip_miles > 10
group by 1, 2 order by 5 desc limit 10 ;
select company, payment_type, avg(fare), count(tips), sum(trip_total)
from chicago_taxi_trips_orc where trip_miles > 10
group by 1, 2 order by 5 desc limit 10 ;
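# optional (a sketch): EXPLAIN shows how Presto plans each scan, e.g. predicate pushdown into ORC
EXPLAIN select count(*) from chicago_taxi_trips_orc where trip_miles > 50;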
# leave the presto session
exit
# close the SSH connection to return to the local shell (where REGION and BUCKET_NAME are set)
exit
## Free up resources: delete the cluster and the bucket
gcloud dataproc clusters delete presto-cluster --region=$REGION --quiet
gsutil -m rm -r gs://${BUCKET_NAME}