Skip to content

Instantly share code, notes, and snippets.

@monimiller
Created July 24, 2024 11:43
Show Gist options
  • Save monimiller/97ef76af0403776e5e14b169a1673e39 to your computer and use it in GitHub Desktop.
Save monimiller/97ef76af0403776e5e14b169a1673e39 to your computer and use it in GitHub Desktop.
ETL Mythbusters SQL
CREATE SCHEMA blue_bikes.<yourname>;
CREATE TABLE bb_bronze_temp_trips (
trip_seconds varchar,
start_time varchar,
stop_time varchar,
start_station_id varchar,
start_station_name varchar,
start_station_latitude varchar,
start_station_longitude varchar,
end_station_id varchar,
end_station_name varchar,
end_station_latitude varchar,
end_station_longitude varchar,
bike_id varchar,
user_type varchar,
postal_code varchar
) WITH (
type = 'HIVE', format = 'CSV', csv_separator = ',',
csv_quote = '"', skip_header_line_count = 1,
--EXTERNAL table as ingestion process landed data
external_location = 's3://starburst101-handsonlab-nyc-uber-rides/blue_bikes/raw_trips-2022_01-2022-09/'
);
SELECT * FROM bb_bronze_temp_trips LIMIT 10;
CREATE TABLE zip_code_income (
state char(2),
zip_code integer,
num_returns integer,
agi integer,
agi_avg decimal(16, 8),
num_returns_with_tot_inc integer,
tot_inc_amt integer,
tot_inc_avg decimal(16, 8),
num_returns_with_tax_inc integer,
tax_inc_amt integer,
tax_inc_avg decimal(16, 8)
)
WITH
(
external_location = 's3://starburst101-handsonlab-nyc-uber-rides/blue_bikes/zip_code_income/',
format = 'TEXTFILE',
textfile_field_separator = ',',
type = 'HIVE'
);
SELECT * FROM zip_code_income LIMIT 10;
SELECT min(trip_seconds), max(trip_seconds)
FROM bb_bronze_temp_trips;
SELECT CAST(trip_seconds AS int) AS trip_seconds
FROM bb_bronze_temp_trips;
SELECT
CAST(trip_seconds AS int) AS trip_seconds,
CAST(start_time AS timestamp(6)) AS start_time,
CAST(stop_time AS timestamp(6)) AS stop_time
FROM bb_bronze_temp_trips;
SELECT min(postal_code), max(postal_code)
FROM bb_bronze_temp_trips;
SELECT count()
FROM bb_bronze_temp_trips
WHERE postal_code = '';
SELECT count()
FROM bb_bronze_temp_trips
WHERE postal_code = null;
SELECT count()
FROM bb_bronze_temp_trips
WHERE postal_code = 'NULL';
SELECT nullif(replace(postal_code, 'NULL', ''), '')
FROM bb_bronze_temp_trips;
SELECT
CAST(trip_seconds AS int) AS trip_seconds,
CAST(start_time AS timestamp(6)) AS start_time,
CAST(stop_time AS timestamp(6)) AS stop_time,
CAST(start_station_id AS int) AS start_station_id,
start_station_name,
CAST(start_station_latitude AS decimal(15,13))
AS start_station_latitude,
CAST(start_station_longitude AS decimal(16,13))
AS start_station_longitude,
CAST(end_station_id AS int) AS end_station_id,
end_station_name,
CAST(end_station_latitude AS decimal(15,13))
AS end_station_latitude,
CAST(end_station_longitude AS decimal(16,13))
AS end_station_longitude,
CAST(bike_id AS int) AS bike_id,
user_type,
nullif(replace(postal_code, 'NULL', ''), '')
AS postal_code
FROM bb_bronze_temp_trips;
SELECT zip_code, state,
format_number(tot_inc_avg * 1000) AS avg_income
FROM zip_code_income
WHERE zip_code IN
(75080, 90402, 10019);
SELECT * FROM zip_code_income
WHERE zip_code IN (02139, 02124);
SELECT t.postal_code, z.tot_inc_amt
FROM bb_bronze_temp_trips AS t
JOIN zip_code_income AS z ON (
cast(t.postal_code as INT) = z.zip_code)
WHERE postal_code IN ('02139', '02124');
SELECT CAST(t.trip_seconds AS int) AS trip_seconds,
nullif(replace(t.postal_code, 'NULL', ''), '') AS postal_code,
cast(z.state AS VARCHAR(2)) as province,
z.tot_inc_avg * 1000 AS avg_income
FROM bb_bronze_temp_trips AS t
LEFT JOIN zip_code_income AS z
ON (try_cast(split_part(nullif(replace(t.postal_code, 'NULL', ''), ''), '-', 1) AS int) = z.zip_code);
CREATE TABLE bb_silver_trips
WITH (type='iceberg', format='orc') AS (
SELECT
CAST(t.trip_seconds AS int) AS trip_seconds,
CAST(start_time AS timestamp(6)) AS start_time,
CAST(stop_time AS timestamp(6)) AS stop_time,
CAST(t.start_station_id AS int) AS start_station_id, t.start_station_name,
CAST(t.start_station_latitude AS decimal(15,13)) AS start_station_latitude,
CAST(t.start_station_longitude AS decimal(16,13)) AS start_station_longitude,
CAST(t.end_station_id AS int) AS end_station_id, end_station_name,
CAST(t.end_station_latitude AS decimal(15,13)) AS end_station_latitude,
CAST(t.end_station_longitude AS decimal(16,13)) AS end_station_longitude,
CAST(t.bike_id AS int) AS bike_id, t.user_type,
nullif(replace(t.postal_code, 'NULL', ''), '') AS postal_code,
cast(z.state AS VARCHAR(2)) as province,
cast(round(z.tot_inc_avg * 1000) AS INT) AS avg_income
FROM
bb_bronze_temp_trips AS t
LEFT JOIN zip_code_income AS z ON (
try_cast(
split_part(
nullif(replace(t.postal_code, 'NULL', ''), ''), '-', 1
) AS int
) = z.zip_code
)
);
SELECT count() FROM bb_bronze_temp_trips;
SELECT count() FROM bb_silver_trips;
SHOW STATS FOR bb_silver_trips;
SELECT * FROM bb_silver_trips LIMIT 10;
SELECT * FROM "bb_silver_trips$snapshots";
SELECT * FROM "bb_silver_trips$history";
SELECT * FROM "bb_silver_trips$files";
SELECT count()
FROM bb_silver_trips
WHERE postal_code = 'NULL' or postal_code = '';
SELECT count()
FROM bb_silver_trips
WHERE postal_code is null;
CREATE VIEW
bb_gold_dur_aggs_by_strt_stat_and_usr_typ
AS (
SELECT start_station_id, user_type,
min(trip_seconds) AS min_trip_seconds,
max(trip_seconds) AS max_trip_seconds,
round(avg(trip_seconds)) AS avg_trip_seconds
FROM bb_silver_trips
GROUP BY start_station_id, user_type
);
SELECT *
FROM bb_gold_dur_aggs_by_strt_stat_and_usr_typ
ORDER BY start_station_id;
SELECT count(*)
FROM bb_gold_dur_aggs_by_strt_stat_and_usr_typ;
CREATE VIEW
bb_gold_avg_inc_by_month_and_hour
AS
WITH dates_decomposed AS (
SELECT substr(cast(date_trunc('month', start_time) AS varchar),
1, 7) AS trip_month,
substr(cast(date_trunc('hour', start_time) AS varchar),
12, 2) AS hr_of_day,
avg_income AS avg_inc_from_with
FROM bb_silver_trips
)
SELECT trip_month, hr_of_day,
round(avg(avg_inc_from_with)) AS avg_income
FROM dates_decomposed
GROUP BY trip_month, hr_of_day
;
SELECT *
FROM bb_gold_avg_inc_by_month_and_hour
WHERE trip_month = '2022-04'
ORDER BY hr_of_day;
DROP TABLE bb_bronze_temp_trips;
CREATE TABLE bb_bronze_temp_trips (
trip_seconds varchar,
start_time varchar,
stop_time varchar,
start_station_id varchar,
start_station_name varchar,
start_station_latitude varchar,
start_station_longitude varchar,
end_station_id varchar,
end_station_name varchar,
end_station_latitude varchar,
end_station_longitude varchar,
bike_id varchar,
user_type varchar,
postal_code varchar
) WITH (
type = 'HIVE', format = 'CSV', csv_separator = ',', csv_quote = '"',
skip_header_line_count = 1,
--NEW data lake location
external_location = 's3://starburst101-handsonlab-nyc-uber-rides/blue_bikes/raw_trips-2022_10/'
);
SELECT * FROM bb_bronze_temp_trips;
SELECT COUNT(*) FROM bb_bronze_temp_trips;
INSERT INTO bb_silver_trips
SELECT
CAST(t.trip_seconds AS int) AS trip_seconds,
CAST(start_time AS timestamp(6)) AS start_time,
CAST(stop_time AS timestamp(6)) AS stop_time,
CAST(t.start_station_id AS int) AS start_station_id,
t.start_station_name,
CAST(t.start_station_latitude AS decimal(15,13))
AS start_station_latitude,
CAST(t.start_station_longitude AS decimal(16,13))
AS start_station_longitude,
CAST(t.end_station_id AS int) AS end_station_id,
end_station_name,
CAST(t.end_station_latitude AS decimal(15,13))
AS end_station_latitude,
CAST(t.end_station_longitude AS decimal(16,13))
AS end_station_longitude,
CAST(t.bike_id AS int) AS bike_id, t.user_type,
nullif(replace(t.postal_code, 'NULL', ''), '')
AS postal_code,
cast(z.state AS VARCHAR(2)) as province,
cast(round(z.tot_inc_avg * 1000) AS INT) AS avg_income
FROM bb_bronze_temp_trips AS t
LEFT JOIN zip_code_income AS z ON (
try_cast(
split_part(
nullif(replace(t.postal_code, 'NULL', ''), ''), '-', 1
) AS int
) = z.zip_code
);
SELECT COUNT(*) FROM bb_silver_trips;
SELECT * FROM "bb_silver_trips$files";
SELECT count(*)
FROM bb_gold_dur_aggs_by_strt_stat_and_usr_typ;
SELECT *
FROM bb_gold_avg_inc_by_month_and_hour
WHERE trip_month = '2022-10'
ORDER BY hr_of_day;
DROP TABLE bb_bronze_temp_trips;
CREATE TABLE bb_bronze_temp_trips (
trip_seconds varchar,
start_time varchar,
stop_time varchar,
start_station_id varchar,
start_station_name varchar,
start_station_latitude varchar,
start_station_longitude varchar,
end_station_id varchar,
end_station_name varchar,
end_station_latitude varchar,
end_station_longitude varchar,
bike_id varchar,
user_type varchar,
postal_code varchar
) WITH (
type = 'HIVE', format = 'CSV', csv_separator = ',', csv_quote = '"',
skip_header_line_count = 1,
--NEW data lake location
external_location = 's3://starburst101-handsonlab-nyc-uber-rides/blue_bikes/raw_trips-2022_11/'
);
INSERT INTO bb_silver_trips
SELECT
CAST(t.trip_seconds AS int) AS trip_seconds,
CAST(start_time AS timestamp(6)) AS start_time,
CAST(stop_time AS timestamp(6)) AS stop_time,
CAST(t.start_station_id AS int) AS start_station_id,
t.start_station_name,
CAST(t.start_station_latitude AS decimal(15,13))
AS start_station_latitude,
CAST(t.start_station_longitude AS decimal(16,13))
AS start_station_longitude,
CAST(t.end_station_id AS int) AS end_station_id,
end_station_name,
CAST(t.end_station_latitude AS decimal(15,13))
AS end_station_latitude,
CAST(t.end_station_longitude AS decimal(16,13))
AS end_station_longitude,
CAST(t.bike_id AS int) AS bike_id, t.user_type,
nullif(replace(t.postal_code, 'NULL', ''), '')
AS postal_code,
cast(z.state AS VARCHAR(2)) as province,
cast(round(z.tot_inc_avg * 1000) AS INT) AS avg_income
FROM bb_bronze_temp_trips AS t
LEFT JOIN zip_code_income AS z ON (
try_cast(
split_part(
nullif(replace(t.postal_code, 'NULL', ''), ''), '-', 1
) AS int
) = z.zip_code
);
SELECT * FROM "bb_silver_trips$snapshots";
SELECT *
FROM bb_gold_dur_aggs_by_strt_stat_and_usr_typ;
SELECT *
FROM bb_gold_avg_inc_by_month_and_hour
WHERE trip_month = '2022-11'
ORDER BY hr_of_day;
SELECT * FROM "bb_silver_trips$files";
ALTER TABLE bb_silver_trips EXECUTE optimize;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment