Created
July 24, 2024 11:43
-
-
Save monimiller/97ef76af0403776e5e14b169a1673e39 to your computer and use it in GitHub Desktop.
ETL Mythbusters SQL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
CREATE SCHEMA blue_bikes.<yourname>; | |
CREATE TABLE bb_bronze_temp_trips ( | |
trip_seconds varchar, | |
start_time varchar, | |
stop_time varchar, | |
start_station_id varchar, | |
start_station_name varchar, | |
start_station_latitude varchar, | |
start_station_longitude varchar, | |
end_station_id varchar, | |
end_station_name varchar, | |
end_station_latitude varchar, | |
end_station_longitude varchar, | |
bike_id varchar, | |
user_type varchar, | |
postal_code varchar | |
) WITH ( | |
type = 'HIVE', format = 'CSV', csv_separator = ',', | |
csv_quote = '"', skip_header_line_count = 1, | |
--EXTERNAL table as ingestion process landed data | |
external_location = 's3://starburst101-handsonlab-nyc-uber-rides/blue_bikes/raw_trips-2022_01-2022-09/' | |
); | |
SELECT * FROM bb_bronze_temp_trips LIMIT 10; | |
CREATE TABLE zip_code_income ( | |
state char(2), | |
zip_code integer, | |
num_returns integer, | |
agi integer, | |
agi_avg decimal(16, 8), | |
num_returns_with_tot_inc integer, | |
tot_inc_amt integer, | |
tot_inc_avg decimal(16, 8), | |
num_returns_with_tax_inc integer, | |
tax_inc_amt integer, | |
tax_inc_avg decimal(16, 8) | |
) | |
WITH | |
( | |
external_location = 's3://starburst101-handsonlab-nyc-uber-rides/blue_bikes/zip_code_income/', | |
format = 'TEXTFILE', | |
textfile_field_separator = ',', | |
type = 'HIVE' | |
); | |
SELECT * FROM zip_code_income LIMIT 10; | |
SELECT min(trip_seconds), max(trip_seconds) | |
FROM bb_bronze_temp_trips; | |
SELECT CAST(trip_seconds AS int) AS trip_seconds | |
FROM bb_bronze_temp_trips; | |
SELECT | |
CAST(trip_seconds AS int) AS trip_seconds, | |
CAST(start_time AS timestamp(6)) AS start_time, | |
CAST(stop_time AS timestamp(6)) AS stop_time | |
FROM bb_bronze_temp_trips; | |
SELECT min(postal_code), max(postal_code) | |
FROM bb_bronze_temp_trips; | |
SELECT count() | |
FROM bb_bronze_temp_trips | |
WHERE postal_code = ''; | |
SELECT count() | |
FROM bb_bronze_temp_trips | |
WHERE postal_code = null; | |
SELECT count() | |
FROM bb_bronze_temp_trips | |
WHERE postal_code = 'NULL'; | |
SELECT nullif(replace(postal_code, 'NULL', ''), '') | |
FROM bb_bronze_temp_trips; | |
SELECT | |
CAST(trip_seconds AS int) AS trip_seconds, | |
CAST(start_time AS timestamp(6)) AS start_time, | |
CAST(stop_time AS timestamp(6)) AS stop_time, | |
CAST(start_station_id AS int) AS start_station_id, | |
start_station_name, | |
CAST(start_station_latitude AS decimal(15,13)) | |
AS start_station_latitude, | |
CAST(start_station_longitude AS decimal(16,13)) | |
AS start_station_longitude, | |
CAST(end_station_id AS int) AS end_station_id, | |
end_station_name, | |
CAST(end_station_latitude AS decimal(15,13)) | |
AS end_station_latitude, | |
CAST(end_station_longitude AS decimal(16,13)) | |
AS end_station_longitude, | |
CAST(bike_id AS int) AS bike_id, | |
user_type, | |
nullif(replace(postal_code, 'NULL', ''), '') | |
AS postal_code | |
FROM bb_bronze_temp_trips; | |
SELECT zip_code, state, | |
format_number(tot_inc_avg * 1000) AS avg_income | |
FROM zip_code_income | |
WHERE zip_code IN | |
(75080, 90402, 10019); | |
SELECT * FROM zip_code_income | |
WHERE zip_code IN (02139, 02124); | |
SELECT t.postal_code, z.tot_inc_amt | |
FROM bb_bronze_temp_trips AS t | |
JOIN zip_code_income AS z ON ( | |
cast(t.postal_code as INT) = z.zip_code) | |
WHERE postal_code IN ('02139', '02124'); | |
SELECT CAST(t.trip_seconds AS int) AS trip_seconds, | |
nullif(replace(t.postal_code, 'NULL', ''), '') AS postal_code, | |
cast(z.state AS VARCHAR(2)) as province, | |
z.tot_inc_avg * 1000 AS avg_income | |
FROM bb_bronze_temp_trips AS t | |
LEFT JOIN zip_code_income AS z | |
ON (try_cast(split_part(nullif(replace(t.postal_code, 'NULL', ''), ''), '-', 1) AS int) = z.zip_code); | |
CREATE TABLE bb_silver_trips | |
WITH (type='iceberg', format='orc') AS ( | |
SELECT | |
CAST(t.trip_seconds AS int) AS trip_seconds, | |
CAST(start_time AS timestamp(6)) AS start_time, | |
CAST(stop_time AS timestamp(6)) AS stop_time, | |
CAST(t.start_station_id AS int) AS start_station_id, t.start_station_name, | |
CAST(t.start_station_latitude AS decimal(15,13)) AS start_station_latitude, | |
CAST(t.start_station_longitude AS decimal(16,13)) AS start_station_longitude, | |
CAST(t.end_station_id AS int) AS end_station_id, end_station_name, | |
CAST(t.end_station_latitude AS decimal(15,13)) AS end_station_latitude, | |
CAST(t.end_station_longitude AS decimal(16,13)) AS end_station_longitude, | |
CAST(t.bike_id AS int) AS bike_id, t.user_type, | |
nullif(replace(t.postal_code, 'NULL', ''), '') AS postal_code, | |
cast(z.state AS VARCHAR(2)) as province, | |
cast(round(z.tot_inc_avg * 1000) AS INT) AS avg_income | |
FROM | |
bb_bronze_temp_trips AS t | |
LEFT JOIN zip_code_income AS z ON ( | |
try_cast( | |
split_part( | |
nullif(replace(t.postal_code, 'NULL', ''), ''), '-', 1 | |
) AS int | |
) = z.zip_code | |
) | |
); | |
SELECT count() FROM bb_bronze_temp_trips; | |
SELECT count() FROM bb_silver_trips; | |
SHOW STATS FOR bb_silver_trips; | |
SELECT * FROM bb_silver_trips LIMIT 10; | |
SELECT * FROM "bb_silver_trips$snapshots"; | |
SELECT * FROM "bb_silver_trips$history"; | |
SELECT * FROM "bb_silver_trips$files"; | |
SELECT count() | |
FROM bb_silver_trips | |
WHERE postal_code = 'NULL' or postal_code = ''; | |
SELECT count() | |
FROM bb_silver_trips | |
WHERE postal_code is null; | |
CREATE VIEW | |
bb_gold_dur_aggs_by_strt_stat_and_usr_typ | |
AS ( | |
SELECT start_station_id, user_type, | |
min(trip_seconds) AS min_trip_seconds, | |
max(trip_seconds) AS max_trip_seconds, | |
round(avg(trip_seconds)) AS avg_trip_seconds | |
FROM bb_silver_trips | |
GROUP BY start_station_id, user_type | |
); | |
SELECT * | |
FROM bb_gold_dur_aggs_by_strt_stat_and_usr_typ | |
ORDER BY start_station_id; | |
SELECT count(*) | |
FROM bb_gold_dur_aggs_by_strt_stat_and_usr_typ; | |
CREATE VIEW | |
bb_gold_avg_inc_by_month_and_hour | |
AS | |
WITH dates_decomposed AS ( | |
SELECT substr(cast(date_trunc('month', start_time) AS varchar), | |
1, 7) AS trip_month, | |
substr(cast(date_trunc('hour', start_time) AS varchar), | |
12, 2) AS hr_of_day, | |
avg_income AS avg_inc_from_with | |
FROM bb_silver_trips | |
) | |
SELECT trip_month, hr_of_day, | |
round(avg(avg_inc_from_with)) AS avg_income | |
FROM dates_decomposed | |
GROUP BY trip_month, hr_of_day | |
; | |
SELECT * | |
FROM bb_gold_avg_inc_by_month_and_hour | |
WHERE trip_month = '2022-04' | |
ORDER BY hr_of_day; | |
DROP TABLE bb_bronze_temp_trips; | |
CREATE TABLE bb_bronze_temp_trips ( | |
trip_seconds varchar, | |
start_time varchar, | |
stop_time varchar, | |
start_station_id varchar, | |
start_station_name varchar, | |
start_station_latitude varchar, | |
start_station_longitude varchar, | |
end_station_id varchar, | |
end_station_name varchar, | |
end_station_latitude varchar, | |
end_station_longitude varchar, | |
bike_id varchar, | |
user_type varchar, | |
postal_code varchar | |
) WITH ( | |
type = 'HIVE', format = 'CSV', csv_separator = ',', csv_quote = '"', | |
skip_header_line_count = 1, | |
--NEW data lake location | |
external_location = 's3://starburst101-handsonlab-nyc-uber-rides/blue_bikes/raw_trips-2022_10/' | |
); | |
SELECT * FROM bb_bronze_temp_trips; | |
SELECT COUNT(*) FROM bb_bronze_temp_trips; | |
INSERT INTO bb_silver_trips | |
SELECT | |
CAST(t.trip_seconds AS int) AS trip_seconds, | |
CAST(start_time AS timestamp(6)) AS start_time, | |
CAST(stop_time AS timestamp(6)) AS stop_time, | |
CAST(t.start_station_id AS int) AS start_station_id, | |
t.start_station_name, | |
CAST(t.start_station_latitude AS decimal(15,13)) | |
AS start_station_latitude, | |
CAST(t.start_station_longitude AS decimal(16,13)) | |
AS start_station_longitude, | |
CAST(t.end_station_id AS int) AS end_station_id, | |
end_station_name, | |
CAST(t.end_station_latitude AS decimal(15,13)) | |
AS end_station_latitude, | |
CAST(t.end_station_longitude AS decimal(16,13)) | |
AS end_station_longitude, | |
CAST(t.bike_id AS int) AS bike_id, t.user_type, | |
nullif(replace(t.postal_code, 'NULL', ''), '') | |
AS postal_code, | |
cast(z.state AS VARCHAR(2)) as province, | |
cast(round(z.tot_inc_avg * 1000) AS INT) AS avg_income | |
FROM bb_bronze_temp_trips AS t | |
LEFT JOIN zip_code_income AS z ON ( | |
try_cast( | |
split_part( | |
nullif(replace(t.postal_code, 'NULL', ''), ''), '-', 1 | |
) AS int | |
) = z.zip_code | |
); | |
SELECT COUNT(*) FROM bb_silver_trips; | |
SELECT * FROM "bb_silver_trips$files"; | |
SELECT count(*) | |
FROM bb_gold_dur_aggs_by_strt_stat_and_usr_typ; | |
SELECT * | |
FROM bb_gold_avg_inc_by_month_and_hour | |
WHERE trip_month = '2022-10' | |
ORDER BY hr_of_day; | |
DROP TABLE bb_bronze_temp_trips; | |
CREATE TABLE bb_bronze_temp_trips ( | |
trip_seconds varchar, | |
start_time varchar, | |
stop_time varchar, | |
start_station_id varchar, | |
start_station_name varchar, | |
start_station_latitude varchar, | |
start_station_longitude varchar, | |
end_station_id varchar, | |
end_station_name varchar, | |
end_station_latitude varchar, | |
end_station_longitude varchar, | |
bike_id varchar, | |
user_type varchar, | |
postal_code varchar | |
) WITH ( | |
type = 'HIVE', format = 'CSV', csv_separator = ',', csv_quote = '"', | |
skip_header_line_count = 1, | |
--NEW data lake location | |
external_location = 's3://starburst101-handsonlab-nyc-uber-rides/blue_bikes/raw_trips-2022_11/' | |
); | |
INSERT INTO bb_silver_trips | |
SELECT | |
CAST(t.trip_seconds AS int) AS trip_seconds, | |
CAST(start_time AS timestamp(6)) AS start_time, | |
CAST(stop_time AS timestamp(6)) AS stop_time, | |
CAST(t.start_station_id AS int) AS start_station_id, | |
t.start_station_name, | |
CAST(t.start_station_latitude AS decimal(15,13)) | |
AS start_station_latitude, | |
CAST(t.start_station_longitude AS decimal(16,13)) | |
AS start_station_longitude, | |
CAST(t.end_station_id AS int) AS end_station_id, | |
end_station_name, | |
CAST(t.end_station_latitude AS decimal(15,13)) | |
AS end_station_latitude, | |
CAST(t.end_station_longitude AS decimal(16,13)) | |
AS end_station_longitude, | |
CAST(t.bike_id AS int) AS bike_id, t.user_type, | |
nullif(replace(t.postal_code, 'NULL', ''), '') | |
AS postal_code, | |
cast(z.state AS VARCHAR(2)) as province, | |
cast(round(z.tot_inc_avg * 1000) AS INT) AS avg_income | |
FROM bb_bronze_temp_trips AS t | |
LEFT JOIN zip_code_income AS z ON ( | |
try_cast( | |
split_part( | |
nullif(replace(t.postal_code, 'NULL', ''), ''), '-', 1 | |
) AS int | |
) = z.zip_code | |
); | |
SELECT * FROM "bb_silver_trips$snapshots"; | |
SELECT * | |
FROM bb_gold_dur_aggs_by_strt_stat_and_usr_typ; | |
SELECT * | |
FROM bb_gold_avg_inc_by_month_and_hour | |
WHERE trip_month = '2022-11' | |
ORDER BY hr_of_day; | |
SELECT * FROM "bb_silver_trips$files"; | |
ALTER TABLE bb_silver_trips EXECUTE optimize; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment